1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 30 /* 31 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS 32 * ================================================================ 33 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access 34 * (NUMA). A NUMA machine consists of one or more "nodes" that each consist of 35 * one or more CPUs and some local memory. The CPUs in each node can access 36 * the memory in the other nodes but at a higher latency than accessing their 37 * local memory. Typically, a system with only one node has Uniform Memory 38 * Access (UMA), but it may be possible to have a one node system that has 39 * some global memory outside of the node which is higher latency. 40 * 41 * Module Description 42 * ------------------ 43 * This module provides a platform interface for determining which CPUs and 44 * which memory (and how much) are in a NUMA node and how far each node is from 45 * each other. 
The interface is used by the Virtual Memory (VM) system and the 46 * common lgroup framework. The VM system uses the plat_*() routines to fill 47 * in its memory node (memnode) array with the physical address range spanned 48 * by each NUMA node to know which memory belongs to which node, so it can 49 * build and manage a physical page free list for each NUMA node and allocate 50 * local memory from each node as needed. The common lgroup framework uses the 51 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong 52 * to each node (leaf lgroup) and how far each node is from each other, so it 53 * can build the latency (lgroup) topology for the machine in order to optimize 54 * for locality. Also, an lgroup platform handle instead of lgroups are used 55 * in the interface with this module, so this module shouldn't need to know 56 * anything about lgroups. Instead, it just needs to know which CPUs, memory, 57 * etc. are in each NUMA node, how far each node is from each other, and to use 58 * a unique lgroup platform handle to refer to each node through the interface. 59 * 60 * Determining NUMA Configuration 61 * ------------------------------ 62 * By default, this module will try to determine the NUMA configuration of the 63 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System 64 * Locality Information Table (SLIT). The SRAT contains info to tell which 65 * CPUs and memory are local to a given proximity domain (NUMA node). The SLIT 66 * is a matrix that gives the distance between each system locality (which is 67 * a NUMA node and should correspond to proximity domains in the SRAT). For 68 * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer 69 * specification. 70 * 71 * If the SRAT doesn't exist on a system with AMD Opteron processors, we 72 * examine registers in PCI configuration space to determine how many nodes are 73 * in the system and which CPUs and memory are in each node. 
 * This could be determined by probing all memory from each CPU, but that is
 * too expensive to do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *       locality info is not guaranteed to work or be compatible across all
 *       Opteron processor families.
 *
 * If the SLIT does not exist or look right, the kernel will probe to determine
 * the distance between nodes as long as the NUMA CPU and memory configuration
 * has been determined (see lgrp_plat_probe() for details).
 *
 * Data Structures
 * ---------------
 * The main data structures used by this code are the following:
 *
 * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
 *					CPU ID (only used for SRAT)
 *
 * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
 *					different nodes indexed by node ID
 *
 * - lgrp_plat_node_cnt			Number of NUMA nodes in system
 *
 * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
 *					table indexed by node ID (only used
 *					for SRAT)
 *
 * - lgrp_plat_node_memory[]		Table with physical address range for
 *					each node indexed by node ID
 *
 * The code is implemented to make the following always be true:
 *
 *	lgroup platform handle == node ID == memnode ID
 *
 * Moreover, it allows for the proximity domain ID to be equal to all of the
 * above as long as the proximity domain IDs are numbered from 0 to <number of
 * nodes - 1>.  This is done by hashing each proximity domain ID into the range
 * from 0 to <number of nodes - 1>.  Then proximity domain ID N will hash into
 * node ID N and proximity domain ID N will be entered into
 * lgrp_plat_node_domain[N] and be assigned node ID N.  If the proximity domain
 * IDs aren't numbered from 0 to <number of nodes - 1>, then hashing the
 * proximity domain IDs into lgrp_plat_node_domain[] will still work for
 * assigning proximity domain IDs to node IDs.
 * However, the proximity domain IDs may not map to the
 * equivalent node ID since we want to keep the node IDs numbered from 0 to
 * <number of nodes - 1> to minimize cost of searching and potentially space.
 */


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/bootconf.h>
#include <sys/cmn_err.h>
#include <sys/controlregs.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>

#include "acpi_fw.h"		/* for SRAT and SLIT */


/*
 * MAX_NODES bounds every per-node table in this file.  NLGRP is the number of
 * entries in the statically allocated lgrp_space[] and lgrp_stats[] arrays.
 */
#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */

/*
 * Hash proximity domain ID into node to domain mapping table to minimize
 * span of entries used
 *
 * The hash is the domain ID modulo lgrp_plat_node_cnt, so lgrp_plat_node_cnt
 * must be nonzero whenever this macro is evaluated.
 */
#define	NODE_DOMAIN_HASH(domain)	((domain) % lgrp_plat_node_cnt)


/*
 * CPU to node ID mapping structure (only used with SRAT)
 */
typedef	struct cpu_node_map {
	int		exists;		/* presumably set once entry is valid */
	uint_t		node;		/* node ID */
	uint32_t	apicid;		/* APIC ID */
	uint32_t	prox_domain;	/* proximity domain ID */
} cpu_node_map_t;

/*
 * Latency statistics
 */
typedef struct lgrp_plat_latency_stats {
	/* latencies[from][to] is the latency from node "from" to node "to" */
	hrtime_t	latencies[MAX_NODES][MAX_NODES];
	hrtime_t	latency_max;	/* largest latency seen (0 == none) */
	hrtime_t	latency_min;	/* smallest latency seen (-1 == none) */
} lgrp_plat_latency_stats_t;

/*
 * Memory configuration for probing
 */
typedef struct lgrp_plat_probe_mem_config {
	size_t	probe_memsize;		/* how much memory to probe per node */
	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
} lgrp_plat_probe_mem_config_t;

/*
 * Statistics kept for probing
 *
 * The two dimensional members are sized MAX_NODES by MAX_NODES like
 * lgrp_plat_latency_stats_t.latencies[][]; presumably they are indexed the
 * same way ([from][to]).
 */
typedef struct lgrp_plat_probe_stats {
	hrtime_t	flush_cost;
	hrtime_t	probe_cost;
	hrtime_t	probe_cost_total;
	hrtime_t	probe_error_code;
	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
	int		probe_suspect[MAX_NODES][MAX_NODES];
	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
} lgrp_plat_probe_stats_t;

/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 */
typedef	struct node_domain_map {
	int		exists;		/* presumably set once entry is valid */
	uint32_t	prox_domain;	/* proximity domain ID */
} node_domain_map_t;

/*
 * Node ID and starting and ending page for physical memory in node
 */
typedef	struct node_phys_addr_map {
	pfn_t		start;		/* first PFN in node (inclusive) */
	pfn_t		end;		/* last PFN in node (inclusive) */
	int		exists;		/* nonzero when node has memory */
	uint32_t	prox_domain;	/* proximity domain ID */
} node_phys_addr_map_t;

/*
 * Error code from processing CPU to APIC ID array boot property
 */
static int				lgrp_plat_cpu_apicid_error = 0;

/*
 * CPU to node ID mapping table (only used for SRAT)
 */
static cpu_node_map_t			lgrp_plat_cpu_node[NCPU];

/*
 * Latency statistics
 */
lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
static int				lgrp_plat_mem_intrlv = 0;

/*
 * Node ID to proximity domain ID mapping table (only used for SRAT)
 */
static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];

/*
 * Physical address range for memory in each node
 */
static node_phys_addr_map_t		lgrp_plat_node_memory[MAX_NODES];

/*
 * Statistics gotten from probing
 */
static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;

/*
 * Memory configuration for probing
 */
static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;

/*
 * Error code from processing ACPI SRAT
 */
static int				lgrp_plat_srat_error = 0;

/*
 * Error code from processing ACPI SLIT
 */
static int				lgrp_plat_slit_error = 0;

/*
 * Allocate lgroup array statically
 *
 * nlgrps_alloc counts how many entries of lgrp_space[] have been handed out.
 */
static lgrp_t				lgrp_space[NLGRP];
static int				nlgrps_alloc;


/*
 * Number of nodes in system
 *
 * Starts out as 1 (a UMA machine) until the NUMA configuration is determined.
 */
uint_t					lgrp_plat_node_cnt = 1;

/*
 * Configuration Parameters for Probing
 * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
 *				operation, etc.
 * - lgrp_plat_probe_nrounds	How many rounds of probing to do
 * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
 *				node
 * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
 *				Northbridge for each probe
 *
 * The flags hold LGRP_PLAT_PROBE_* bits and none are set by default; the
 * other three parameters default to the LGRP_PLAT_PROBE_N* constants.
 */
uint_t	lgrp_plat_probe_flags = 0;
int	lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
int	lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
int	lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * Enable use of ACPI System Resource Affinity Table (SRAT) and System
 * Locality Information Table (SLIT)
 *
 * Both are enabled (nonzero) by default.
 */
int	lgrp_plat_srat_enable = 1;
int	lgrp_plat_slit_enable = 1;

/*
 * Static array to hold lgroup statistics
 */
struct lgrp_stats	lgrp_stats[NLGRP];


/*
 * Forward declarations of platform interface routines
 */
void		plat_build_mem_nodes(struct memlist *list);

int		plat_lgrphand_to_mem_node(lgrp_handle_t hand);

lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);

int		plat_mnode_xcheck(pfn_t pfncnt);

int		plat_pfn_to_mem_node(pfn_t pfn);

/*
 * Forward declarations of lgroup platform interface routines
 */
lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);

void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);

lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);

void		lgrp_plat_init(void);

int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);

void		lgrp_plat_main_init(void);

int		lgrp_plat_max_lgrps(void);

pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
    lgrp_mem_query_t query);

lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);

void		lgrp_plat_probe(void);

lgrp_handle_t	lgrp_plat_root_hand(void);


/*
 * Forward declarations of local routines
 *
 * Most of these take the mapping tables and statistics structures as
 * arguments rather than referring to the file-scope instances by name.
 */
static int	is_opteron(void);

static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
    cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain);

static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node);

static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
    uint32_t domain);

static void	lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
    lgrp_plat_latency_stats_t *lat_stats,
    lgrp_plat_probe_stats_t *probe_stats);

static int	lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory,
    lgrp_plat_latency_stats_t *lat_stats);

static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);

static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
    uint32_t domain);

static int	lgrp_plat_node_memory_update(node_domain_map_t *node_domain,
    node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end,
    uint32_t domain);

static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
    lgrp_plat_probe_mem_config_t *probe_mem_config,
    lgrp_plat_latency_stats_t *lat_stats,
    lgrp_plat_probe_stats_t *probe_stats);

static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node,
    int boot_ncpus);

static int	lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt,
    node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats);

static int	lgrp_plat_process_srat(struct srat *tp, int cpu_count,
    uint_t *node_cnt, node_domain_map_t *node_domain, cpu_node_map_t *cpu_node,
    node_phys_addr_map_t *node_memory);

static int	lgrp_plat_srat_domains(struct srat *tp);

static void	lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
    lgrp_plat_latency_stats_t *lat_stats);

static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    node_phys_addr_map_t *node_memory);

static hrtime_t	opt_probe_vendor(int dest_node, int nreads);


/*
 * PLATFORM INTERFACE ROUTINES
 */

/*
 * Configure memory
nodes for machines with more than one node (ie NUMA) 425 */ 426 void 427 plat_build_mem_nodes(struct memlist *list) 428 { 429 pfn_t cur_start; /* start addr of subrange */ 430 pfn_t cur_end; /* end addr of subrange */ 431 pfn_t start; /* start addr of whole range */ 432 pfn_t end; /* end addr of whole range */ 433 434 /* 435 * Boot install lists are arranged <addr, len>, ... 436 */ 437 while (list) { 438 int node; 439 440 start = list->address >> PAGESHIFT; 441 end = (list->address + list->size - 1) >> PAGESHIFT; 442 443 if (start > physmax) { 444 list = list->next; 445 continue; 446 } 447 if (end > physmax) 448 end = physmax; 449 450 /* 451 * When there is only one memnode, just add memory to memnode 452 */ 453 if (max_mem_nodes == 1) { 454 mem_node_add_slice(start, end); 455 list = list->next; 456 continue; 457 } 458 459 /* 460 * mem_node_add_slice() expects to get a memory range that 461 * is within one memnode, so need to split any memory range 462 * that spans multiple memnodes into subranges that are each 463 * contained within one memnode when feeding them to 464 * mem_node_add_slice() 465 */ 466 cur_start = start; 467 do { 468 node = plat_pfn_to_mem_node(cur_start); 469 470 /* 471 * Panic if DRAM address map registers or SRAT say 472 * memory in node doesn't exist or address from 473 * boot installed memory list entry isn't in this node. 474 * This shouldn't happen and rest of code can't deal 475 * with this if it does. 
476 */ 477 if (node < 0 || node >= lgrp_plat_node_cnt || 478 !lgrp_plat_node_memory[node].exists || 479 cur_start < lgrp_plat_node_memory[node].start || 480 cur_start > lgrp_plat_node_memory[node].end) { 481 cmn_err(CE_PANIC, "Don't know which memnode " 482 "to add installed memory address 0x%lx\n", 483 cur_start); 484 } 485 486 /* 487 * End of current subrange should not span memnodes 488 */ 489 cur_end = end; 490 if (lgrp_plat_node_memory[node].exists && 491 cur_end > lgrp_plat_node_memory[node].end) 492 cur_end = lgrp_plat_node_memory[node].end; 493 494 mem_node_add_slice(cur_start, cur_end); 495 496 /* 497 * Next subrange starts after end of current one 498 */ 499 cur_start = cur_end + 1; 500 } while (cur_end < end); 501 502 list = list->next; 503 } 504 mem_node_physalign = 0; 505 mem_node_pfn_shift = 0; 506 } 507 508 509 int 510 plat_lgrphand_to_mem_node(lgrp_handle_t hand) 511 { 512 if (max_mem_nodes == 1) 513 return (0); 514 515 return ((int)hand); 516 } 517 518 519 /* 520 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 521 * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if 522 * a crossing is found and returns 0 otherwise. 523 */ 524 int 525 plat_mnode_xcheck(pfn_t pfncnt) 526 { 527 int node, prevnode = -1, basenode; 528 pfn_t ea, sa; 529 530 for (node = 0; node < lgrp_plat_node_cnt; node++) { 531 532 if (lgrp_plat_node_memory[node].exists == 0) 533 continue; 534 535 if (prevnode == -1) { 536 prevnode = node; 537 basenode = node; 538 continue; 539 } 540 541 /* assume x86 node pfn ranges are in increasing order */ 542 ASSERT(lgrp_plat_node_memory[node].start > 543 lgrp_plat_node_memory[prevnode].end); 544 545 /* 546 * continue if the starting address of node is not contiguous 547 * with the previous node. 
548 */ 549 550 if (lgrp_plat_node_memory[node].start != 551 (lgrp_plat_node_memory[prevnode].end + 1)) { 552 basenode = node; 553 prevnode = node; 554 continue; 555 } 556 557 /* check if the starting address of node is pfncnt aligned */ 558 if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) { 559 560 /* 561 * at this point, node starts at an unaligned boundary 562 * and is contiguous with the previous node(s) to 563 * basenode. Check if there is an aligned contiguous 564 * range of length pfncnt that crosses this boundary. 565 */ 566 567 sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, 568 pfncnt); 569 ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), 570 pfncnt); 571 572 ASSERT((ea - sa) == pfncnt); 573 if (sa >= lgrp_plat_node_memory[basenode].start && 574 ea <= (lgrp_plat_node_memory[node].end + 1)) 575 return (1); 576 } 577 prevnode = node; 578 } 579 return (0); 580 } 581 582 583 lgrp_handle_t 584 plat_mem_node_to_lgrphand(int mnode) 585 { 586 if (max_mem_nodes == 1) 587 return (LGRP_DEFAULT_HANDLE); 588 589 return ((lgrp_handle_t)mnode); 590 } 591 592 593 int 594 plat_pfn_to_mem_node(pfn_t pfn) 595 { 596 int node; 597 598 if (max_mem_nodes == 1) 599 return (0); 600 601 for (node = 0; node < lgrp_plat_node_cnt; node++) { 602 /* 603 * Skip nodes with no memory 604 */ 605 if (!lgrp_plat_node_memory[node].exists) 606 continue; 607 608 if (pfn >= lgrp_plat_node_memory[node].start && 609 pfn <= lgrp_plat_node_memory[node].end) 610 return (node); 611 } 612 613 /* 614 * Didn't find memnode where this PFN lives which should never happen 615 */ 616 ASSERT(node < lgrp_plat_node_cnt); 617 return (-1); 618 } 619 620 621 /* 622 * LGROUP PLATFORM INTERFACE ROUTINES 623 */ 624 625 /* 626 * Allocate additional space for an lgroup. 
627 */ 628 /* ARGSUSED */ 629 lgrp_t * 630 lgrp_plat_alloc(lgrp_id_t lgrpid) 631 { 632 lgrp_t *lgrp; 633 634 lgrp = &lgrp_space[nlgrps_alloc++]; 635 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 636 return (NULL); 637 return (lgrp); 638 } 639 640 641 /* 642 * Platform handling for (re)configuration changes 643 */ 644 /* ARGSUSED */ 645 void 646 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 647 { 648 } 649 650 651 /* 652 * Return the platform handle for the lgroup containing the given CPU 653 */ 654 /* ARGSUSED */ 655 lgrp_handle_t 656 lgrp_plat_cpu_to_hand(processorid_t id) 657 { 658 lgrp_handle_t hand; 659 660 if (lgrp_plat_node_cnt == 1) 661 return (LGRP_DEFAULT_HANDLE); 662 663 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 664 lgrp_plat_cpu_node); 665 666 ASSERT(hand != (lgrp_handle_t)-1); 667 if (hand == (lgrp_handle_t)-1) 668 return (LGRP_NULL_HANDLE); 669 670 return (hand); 671 } 672 673 674 /* 675 * Platform-specific initialization of lgroups 676 */ 677 void 678 lgrp_plat_init(void) 679 { 680 #if defined(__xpv) 681 /* 682 * XXPV For now, the hypervisor treats all memory equally. 
683 */ 684 lgrp_plat_node_cnt = max_mem_nodes = 1; 685 #else /* __xpv */ 686 uint_t probe_op; 687 688 /* 689 * Initialize as a UMA machine 690 */ 691 if (lgrp_topo_ht_limit() == 1) { 692 lgrp_plat_node_cnt = max_mem_nodes = 1; 693 return; 694 } 695 696 /* 697 * Read boot property with CPU to APIC ID mapping table/array and fill 698 * in CPU to node ID mapping table with APIC ID for each CPU 699 */ 700 lgrp_plat_cpu_apicid_error = 701 lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node, boot_max_ncpus); 702 703 /* 704 * Determine which CPUs and memory are local to each other and number 705 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) 706 */ 707 if (!lgrp_plat_cpu_apicid_error) { 708 lgrp_plat_srat_error = lgrp_plat_process_srat(srat_ptr, 709 boot_max_ncpus, &lgrp_plat_node_cnt, lgrp_plat_node_domain, 710 lgrp_plat_cpu_node, lgrp_plat_node_memory); 711 } 712 713 /* 714 * Try to use PCI config space registers on Opteron if there's an error 715 * processing CPU to APIC ID mapping or SRAT 716 */ 717 if ((lgrp_plat_cpu_apicid_error != 0 || lgrp_plat_srat_error != 0) && 718 is_opteron()) 719 opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, 720 lgrp_plat_node_memory); 721 722 /* 723 * Don't bother to setup system for multiple lgroups and only use one 724 * memory node when memory is interleaved between any nodes or there is 725 * only one NUMA node 726 * 727 * NOTE: May need to change this for Dynamic Reconfiguration (DR) 728 * when and if it happens for x86/x64 729 */ 730 if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { 731 lgrp_plat_node_cnt = max_mem_nodes = 1; 732 (void) lgrp_topo_ht_limit_set(1); 733 return; 734 } 735 736 /* 737 * Leaf lgroups on x86/x64 architectures contain one physical 738 * processor chip. Tune lgrp_expand_proc_thresh and 739 * lgrp_expand_proc_diff so that lgrp_choose() will spread 740 * things out aggressively. 
741 */ 742 lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 743 lgrp_expand_proc_diff = 0; 744 745 /* 746 * There should be one memnode (physical page free list(s)) for 747 * each node 748 */ 749 max_mem_nodes = lgrp_plat_node_cnt; 750 751 /* 752 * Initialize min and max latency before reading SLIT or probing 753 */ 754 lgrp_plat_lat_stats.latency_min = -1; 755 lgrp_plat_lat_stats.latency_max = 0; 756 757 /* 758 * Determine how far each NUMA node is from each other by 759 * reading ACPI System Locality Information Table (SLIT) if it 760 * exists 761 */ 762 lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, 763 lgrp_plat_node_cnt, lgrp_plat_node_memory, 764 &lgrp_plat_lat_stats); 765 if (lgrp_plat_slit_error == 0) 766 return; 767 768 /* 769 * Probe to determine latency between NUMA nodes when SLIT 770 * doesn't exist or make sense 771 */ 772 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; 773 774 /* 775 * Specify whether to probe using vendor ID register or page copy 776 * if hasn't been specified already or is overspecified 777 */ 778 probe_op = lgrp_plat_probe_flags & 779 (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 780 781 if (probe_op == 0 || 782 probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { 783 lgrp_plat_probe_flags &= 784 ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 785 if (is_opteron()) 786 lgrp_plat_probe_flags |= 787 LGRP_PLAT_PROBE_VENDOR; 788 else 789 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; 790 } 791 792 /* 793 * Probing errors can mess up the lgroup topology and 794 * force us fall back to a 2 level lgroup topology. 795 * Here we bound how tall the lgroup topology can grow 796 * in hopes of avoiding any anamolies in probing from 797 * messing up the lgroup topology by limiting the 798 * accuracy of the latency topology. 
799 * 800 * Assume that nodes will at least be configured in a 801 * ring, so limit height of lgroup topology to be less 802 * than number of nodes on a system with 4 or more 803 * nodes 804 */ 805 if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 806 lgrp_topo_ht_limit_default()) 807 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 808 #endif /* __xpv */ 809 } 810 811 812 /* 813 * Return latency between "from" and "to" lgroups 814 * 815 * This latency number can only be used for relative comparison 816 * between lgroups on the running system, cannot be used across platforms, 817 * and may not reflect the actual latency. It is platform and implementation 818 * specific, so platform gets to decide its value. It would be nice if the 819 * number was at least proportional to make comparisons more meaningful though. 820 */ 821 /* ARGSUSED */ 822 int 823 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 824 { 825 lgrp_handle_t src, dest; 826 int node; 827 828 if (max_mem_nodes == 1) 829 return (0); 830 831 /* 832 * Return max latency for root lgroup 833 */ 834 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 835 return (lgrp_plat_lat_stats.latency_max); 836 837 src = from; 838 dest = to; 839 840 /* 841 * Return 0 for nodes (lgroup platform handles) out of range 842 */ 843 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 844 return (0); 845 846 /* 847 * Probe from current CPU if its lgroup latencies haven't been set yet 848 * and we are trying to get latency from current CPU to some node 849 */ 850 node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 851 ASSERT(node >= 0 && node < lgrp_plat_node_cnt); 852 if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src) 853 lgrp_plat_probe(); 854 855 return (lgrp_plat_lat_stats.latencies[src][dest]); 856 } 857 858 859 /* 860 * Platform-specific initialization 861 */ 862 void 863 lgrp_plat_main_init(void) 864 { 865 int curnode; 866 int ht_limit; 867 int i; 868 869 /* 870 * 
Print a notice that MPO is disabled when memory is interleaved 871 * across nodes....Would do this when it is discovered, but can't 872 * because it happens way too early during boot.... 873 */ 874 if (lgrp_plat_mem_intrlv) 875 cmn_err(CE_NOTE, 876 "MPO disabled because memory is interleaved\n"); 877 878 /* 879 * Don't bother to do any probing if it is disabled, there is only one 880 * node, or the height of the lgroup topology less than or equal to 2 881 */ 882 ht_limit = lgrp_topo_ht_limit(); 883 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 884 max_mem_nodes == 1 || ht_limit <= 2) { 885 /* 886 * Setup lgroup latencies for 2 level lgroup topology 887 * (ie. local and remote only) if they haven't been set yet 888 */ 889 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 890 lgrp_plat_lat_stats.latency_max == 0) 891 lgrp_plat_2level_setup(lgrp_plat_node_memory, 892 &lgrp_plat_lat_stats); 893 return; 894 } 895 896 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 897 /* 898 * Should have been able to probe from CPU 0 when it was added 899 * to lgroup hierarchy, but may not have been able to then 900 * because it happens so early in boot that gethrtime() hasn't 901 * been initialized. 
(:-( 902 */ 903 curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 904 ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); 905 if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) 906 lgrp_plat_probe(); 907 908 return; 909 } 910 911 /* 912 * When probing memory, use one page for every sample to determine 913 * lgroup topology and taking multiple samples 914 */ 915 if (lgrp_plat_probe_mem_config.probe_memsize == 0) 916 lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * 917 lgrp_plat_probe_nsamples; 918 919 /* 920 * Map memory in each node needed for probing to determine latency 921 * topology 922 */ 923 for (i = 0; i < lgrp_plat_node_cnt; i++) { 924 int mnode; 925 926 /* 927 * Skip this node and leave its probe page NULL 928 * if it doesn't have any memory 929 */ 930 mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); 931 if (!mem_node_config[mnode].exists) { 932 lgrp_plat_probe_mem_config.probe_va[i] = NULL; 933 continue; 934 } 935 936 /* 937 * Allocate one kernel virtual page 938 */ 939 lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, 940 lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); 941 if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { 942 cmn_err(CE_WARN, 943 "lgrp_plat_main_init: couldn't allocate memory"); 944 return; 945 } 946 947 /* 948 * Get PFN for first page in each node 949 */ 950 lgrp_plat_probe_mem_config.probe_pfn[i] = 951 mem_node_config[mnode].physbase; 952 953 /* 954 * Map virtual page to first page in node 955 */ 956 hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], 957 lgrp_plat_probe_mem_config.probe_memsize, 958 lgrp_plat_probe_mem_config.probe_pfn[i], 959 PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 960 HAT_LOAD_NOCONSIST); 961 } 962 963 /* 964 * Probe from current CPU 965 */ 966 lgrp_plat_probe(); 967 } 968 969 970 /* 971 * Return the maximum number of lgrps supported by the platform. 972 * Before lgrp topology is known it returns an estimate based on the number of 973 * nodes. 
Once topology is known it returns the actual maximim number of lgrps 974 * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and 975 * dynamic addition of new nodes, this number may not grow during system 976 * lifetime (yet). 977 */ 978 int 979 lgrp_plat_max_lgrps(void) 980 { 981 return (lgrp_topo_initialized ? 982 lgrp_alloc_max + 1 : 983 lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 984 } 985 986 987 /* 988 * Return the number of free pages in an lgroup. 989 * 990 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 991 * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 992 * number of allocatable base pagesize pages corresponding to the 993 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 994 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 995 * memory installed, regardless of whether or not it's usable. 996 */ 997 pgcnt_t 998 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 999 { 1000 int mnode; 1001 pgcnt_t npgs = (pgcnt_t)0; 1002 extern struct memlist *phys_avail; 1003 extern struct memlist *phys_install; 1004 1005 1006 if (plathand == LGRP_DEFAULT_HANDLE) 1007 return (lgrp_plat_mem_size_default(plathand, query)); 1008 1009 if (plathand != LGRP_NULL_HANDLE) { 1010 mnode = plat_lgrphand_to_mem_node(plathand); 1011 if (mnode >= 0 && mem_node_config[mnode].exists) { 1012 switch (query) { 1013 case LGRP_MEM_SIZE_FREE: 1014 npgs = MNODE_PGCNT(mnode); 1015 break; 1016 case LGRP_MEM_SIZE_AVAIL: 1017 npgs = mem_node_memlist_pages(mnode, 1018 phys_avail); 1019 break; 1020 case LGRP_MEM_SIZE_INSTALL: 1021 npgs = mem_node_memlist_pages(mnode, 1022 phys_install); 1023 break; 1024 default: 1025 break; 1026 } 1027 } 1028 } 1029 return (npgs); 1030 } 1031 1032 1033 /* 1034 * Return the platform handle of the lgroup that contains the physical memory 1035 * corresponding to the given page frame number 1036 */ 1037 /* ARGSUSED */ 1038 lgrp_handle_t 1039 
lgrp_plat_pfn_to_hand(pfn_t pfn) 1040 { 1041 int mnode; 1042 1043 if (max_mem_nodes == 1) 1044 return (LGRP_DEFAULT_HANDLE); 1045 1046 if (pfn > physmax) 1047 return (LGRP_NULL_HANDLE); 1048 1049 mnode = plat_pfn_to_mem_node(pfn); 1050 if (mnode < 0) 1051 return (LGRP_NULL_HANDLE); 1052 1053 return (MEM_NODE_2_LGRPHAND(mnode)); 1054 } 1055 1056 1057 /* 1058 * Probe memory in each node from current CPU to determine latency topology 1059 * 1060 * The probing code will probe the vendor ID register on the Northbridge of 1061 * Opteron processors and probe memory for other processors by default. 1062 * 1063 * Since probing is inherently error prone, the code takes laps across all the 1064 * nodes probing from each node to each of the other nodes some number of 1065 * times. Furthermore, each node is probed some number of times before moving 1066 * onto the next one during each lap. The minimum latency gotten between nodes 1067 * is kept as the latency between the nodes. 1068 * 1069 * After all that, the probe times are adjusted by normalizing values that are 1070 * close to each other and local latencies are made the same. Lastly, the 1071 * latencies are verified to make sure that certain conditions are met (eg. 1072 * local < remote, latency(a, b) == latency(b, a), etc.). 1073 * 1074 * If any of the conditions aren't met, the code will export a NUMA 1075 * configuration with the local CPUs and memory given by the SRAT or PCI config 1076 * space registers and one remote memory latency since it can't tell exactly 1077 * how far each node is from each other. 
 */
void
lgrp_plat_probe(void)
{
	int				from;
	int				i;
	lgrp_plat_latency_stats_t	*lat_stats;
	hrtime_t			probe_time;
	int				to;

	/*
	 * Nothing to do when probing isn't enabled, there is only one memory
	 * node, or lgrp_topo_ht_limit() is 2 or less
	 */
	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
	    max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
	ASSERT(from >= 0 && from < lgrp_plat_node_cnt);
	if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error)
		ASSERT(lgrp_plat_node_domain[from].exists);

	/*
	 * Don't need to probe if got times already
	 * (a non-zero local latency for this node means it has been probed)
	 */
	lat_stats = &lgrp_plat_lat_stats;
	if (lat_stats->latencies[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++) {
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 * (a probe time of 0 abandons the whole probe and
			 * skips the adjust/verify steps below)
			 */
			probe_time = lgrp_plat_probe_time(to,
			    lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config,
			    &lgrp_plat_lat_stats, &lgrp_plat_probe_stats);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lat_stats->latencies[from][to] == 0 ||
			    probe_time < lat_stats->latencies[from][to])
				lat_stats->latencies[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes (a latency_min of -1 means no
			 * minimum has been recorded yet)
			 */
			if (probe_time < lat_stats->latency_min ||
			    lat_stats->latency_min == -1)
				lat_stats->latency_min = probe_time;
			if (probe_time > lat_stats->latency_max)
				lat_stats->latency_max = probe_time;
		}
	}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fallback to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats,
	    &lgrp_plat_probe_stats);
	lgrp_plat_probe_stats.probe_error_code =
	    lgrp_plat_latency_verify(lgrp_plat_node_memory,
	    &lgrp_plat_lat_stats);
	if (lgrp_plat_probe_stats.probe_error_code)
		lgrp_plat_2level_setup(lgrp_plat_node_memory,
		    &lgrp_plat_lat_stats);
}


/*
 * Return platform handle for root lgroup
 */
lgrp_handle_t
lgrp_plat_root_hand(void)
{
	return (LGRP_DEFAULT_HANDLE);
}


/*
 * INTERNAL ROUTINES
 */


/*
 * Update CPU to node mapping for given CPU and proximity domain
 *
 * The first existing entry among the first "nentries" entries of cpu_node
 * whose APIC ID matches is the one that gets updated.
 *
 * Returns:
 *	 1	Entry already had this node and proximity domain
 *	 0	Entry was filled in with node and proximity domain
 *	-1	No node could be found or created for the proximity domain
 *	-2	No existing entry has the given APIC ID
 */
static int
lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
    cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain)
{
	uint_t	i;
	int	node;

	/*
	 * Get node number for proximity domain
	 * (creating a node to domain mapping if there isn't one yet)
	 */
	node = lgrp_plat_domain_to_node(node_domain, domain);
	if (node == -1) {
		node = lgrp_plat_node_domain_update(node_domain, domain);
		if (node == -1)
			return (-1);
	}

	/*
	 * Search for entry with given APIC ID and fill in its node and
	 * proximity domain IDs (if they haven't been set already)
	 */
	for (i = 0; i < nentries; i++) {
		/*
		 * Skip nonexistent entries and ones without matching APIC ID
		 */
		if (!cpu_node[i].exists || cpu_node[i].apicid != apicid)
			continue;

		/*
		 * Just return if entry completely and correctly filled in
		 * already
		 */
		if (cpu_node[i].prox_domain == domain &&
		    cpu_node[i].node == node)
			return (1);

		/*
		 * Fill in node and proximity domain IDs
		 */
		cpu_node[i].prox_domain = domain;
		cpu_node[i].node = node;

		return (0);
	}

	/*
	 * Return error when entry for APIC ID wasn't found in table
	 */
	return (-2);
}


/*
 * Get node ID for given CPU
 *
 * Returns -1 when cp is NULL, its CPU ID is outside [0, max_ncpus), or no
 * node can be determined for it.
 */
static int
lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node)
{
	processorid_t	cpuid;

	if (cp == NULL)
		return (-1);

	cpuid = cp->cpu_id;
	if (cpuid < 0 || cpuid >= max_ncpus)
		return (-1);

	/*
	 * SRAT doesn't exist, isn't enabled, or there was an error processing
	 * it, so return chip ID for Opteron and -1 otherwise.
	 */
	if (srat_ptr == NULL || !lgrp_plat_srat_enable ||
	    lgrp_plat_srat_error) {
		if (is_opteron())
			return (pg_plat_hw_instance_id(cp, PGHW_CHIP));
		return (-1);
	}

	/*
	 * Return -1 when CPU to node ID mapping entry doesn't exist for given
	 * CPU
	 */
	if (!cpu_node[cpuid].exists)
		return (-1);

	return (cpu_node[cpuid].node);
}


/*
 * Return node number for given proximity domain/system locality
 * (or -1 when the mapping table has no existing entry for it)
 */
static int
lgrp_plat_domain_to_node(node_domain_map_t *node_domain, uint32_t domain)
{
	uint_t	node;
	uint_t	start;

	/*
	 * Hash proximity domain ID into node to domain mapping table (array),
	 * search for entry with matching proximity domain ID, and return index
	 * of matching entry as node ID.
	 *
	 * The search probes successive slots (wrapping through
	 * NODE_DOMAIN_HASH) and stops once it is back where it started.
	 */
	node = start = NODE_DOMAIN_HASH(domain);
	do {
		if (node_domain[node].prox_domain == domain &&
		    node_domain[node].exists)
			return (node);
		node = NODE_DOMAIN_HASH(node + 1);
	} while (node != start);
	return (-1);
}


/*
 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to
 * be considered same
 */
#define	LGRP_LAT_TOLERANCE_SHIFT	4

/*
 * Tolerance shift actually used by lgrp_plat_latency_adjust()
 */
int	lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT;


/*
 * Adjust latencies between nodes to be symmetric, normalize latencies between
 * any nodes that are within some tolerance to be same, and make local
 * latencies be same
 *
 * Only nodes marked as existing in node_memory are looked at and a latency
 * of 0 is treated as "not measured yet" and left alone.  Corrections are
 * recorded in probe_stats (probe_errors and probe_suspect) and reported
 * through lgrp_config().  lat_stats->latency_max is recomputed at the end.
 */
static void
lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory,
    lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats)
{
	int				i;
	int				j;
	int				k;
	int				l;
	u_longlong_t			max;
	u_longlong_t			min;
	u_longlong_t			t;
	u_longlong_t			t1;
	u_longlong_t			t2;
	const lgrp_config_flag_t	cflag = LGRP_CONFIG_LAT_CHANGE_ALL;
	int				lat_corrected[MAX_NODES][MAX_NODES];

	/*
	 * Nothing to do when this is an UMA machine or don't have args needed
	 */
	if (max_mem_nodes == 1)
		return;

	ASSERT(node_memory != NULL && lat_stats != NULL &&
	    probe_stats != NULL);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		if (!node_memory[i].exists)
			continue;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (!node_memory[j].exists)
				continue;

			t1 = lat_stats->latencies[i][j];
			t2 = lat_stats->latencies[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			/*
			 * Latencies should be same
			 * - Use minimum of two latencies which should be same
			 * - Track suspect probe times not within tolerance of
			 *   min value
			 * - Remember how much values are corrected by
			 *
			 * t1 != t2 here, so exactly one of the two branches
			 * below runs and sets t
			 */
			if (t1 > t2) {
				t = t2;
				probe_stats->probe_errors[i][j] += t1 - t2;
				if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) {
					probe_stats->probe_suspect[i][j]++;
					probe_stats->probe_suspect[j][i]++;
				}
			} else if (t2 > t1) {
				t = t1;
				probe_stats->probe_errors[j][i] += t2 - t1;
				if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) {
					probe_stats->probe_suspect[i][j]++;
					probe_stats->probe_suspect[j][i]++;
				}
			}

			lat_stats->latencies[i][j] =
			    lat_stats->latencies[j][i] = t;
			lgrp_config(cflag, t1, t);
			lgrp_config(cflag, t2, t);
		}
	}

	/*
	 * Keep track of which latencies get corrected
	 * (start out with none of them marked)
	 */
	for (i = 0; i < MAX_NODES; i++)
		for (j = 0; j < MAX_NODES; j++)
			lat_corrected[i][j] = 0;

	/*
	 * For every two nodes, see whether there is another pair of nodes which
	 * are about the same distance apart and make the latencies be the same
	 * if they are close enough together
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		if (!node_memory[i].exists)
			continue;
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (!node_memory[j].exists)
				continue;
			/*
			 * Pick one pair of nodes (i, j)
			 * and get latency between them
			 */
			t1 = lat_stats->latencies[i][j];

			/*
			 * Skip this pair of nodes if there isn't a latency
			 * for it yet
			 */
			if (t1 == 0)
				continue;

			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				if (!node_memory[k].exists)
					continue;
				for (l = 0; l < lgrp_plat_node_cnt; l++) {
					if (!node_memory[l].exists)
						continue;
					/*
					 * Pick another pair of nodes (k, l)
					 * not same as (i, j) and get latency
					 * between them
					 */
					if (k == i && l == j)
						continue;

					t2 = lat_stats->latencies[k][l];

					/*
					 * Skip this pair of nodes if there
					 * isn't a latency for it yet
					 */

					if (t2 == 0)
						continue;

					/*
					 * Skip nodes (k, l) if they already
					 * have same latency as (i, j) or
					 * their latency isn't close enough to
					 * be considered/made the same
					 *
					 * The tolerance is the larger of the
					 * two latencies shifted right by
					 * lgrp_plat_probe_lt_shift
					 */
					if (t1 == t2 || (t1 > t2 && t1 - t2 >
					    t1 >> lgrp_plat_probe_lt_shift) ||
					    (t2 > t1 && t2 - t1 >
					    t2 >> lgrp_plat_probe_lt_shift))
						continue;

					/*
					 * Make latency(i, j) same as
					 * latency(k, l), try to use latency
					 * that has been adjusted already to get
					 * more consistency (if possible), and
					 * remember which latencies were
					 * adjusted for next time
					 *
					 * t1 is kept up to date here because
					 * it is reused for the remaining
					 * (k, l) pairs of this (i, j)
					 */
					if (lat_corrected[i][j]) {
						t = t1;
						lgrp_config(cflag, t2, t);
						t2 = t;
					} else if (lat_corrected[k][l]) {
						t = t2;
						lgrp_config(cflag, t1, t);
						t1 = t;
					} else {
						if (t1 > t2)
							t = t2;
						else
							t = t1;
						lgrp_config(cflag, t1, t);
						lgrp_config(cflag, t2, t);
						t1 = t2 = t;
					}

					lat_stats->latencies[i][j] =
					    lat_stats->latencies[k][l] = t;

					lat_corrected[i][j] =
					    lat_corrected[k][l] = 1;
				}
			}
		}
	}

	/*
	 * Local latencies should be same
	 * - Find min and max local latencies
	 * - Make all local latencies be minimum
	 *
	 * min is unsigned, so the -1 assigned here is its largest value and
	 * serves as the "no minimum found yet" marker tested below
	 */
	min = -1;
	max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		if (!node_memory[i].exists)
			continue;
		t = lat_stats->latencies[i][i];
		if (t == 0)
			continue;
		if (min == -1 || t < min)
			min = t;
		if (t > max)
			max = t;
	}
	if (min != max) {
		for (i = 0; i < lgrp_plat_node_cnt; i++) {
			/*
			 * NOTE(review): local is an int while the other
			 * latency temporaries here are u_longlong_t;
			 * presumably local latencies always fit - confirm
			 */
			int	local;

			if (!node_memory[i].exists)
				continue;

			local = lat_stats->latencies[i][i];
			if (local == 0)
				continue;

			/*
			 * Track suspect probe times that aren't within
			 * tolerance of minimum local latency and how much
			 * probe times are corrected by
			 */
			if (local - min > min >> lgrp_plat_probe_lt_shift)
				probe_stats->probe_suspect[i][i]++;

			probe_stats->probe_errors[i][i] += local - min;

			/*
			 * Make local latencies be minimum
			 */
			lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min);
			lat_stats->latencies[i][i] = min;
		}
	}

	/*
	 * Determine max probe time again since just adjusted latencies
	 */
	lat_stats->latency_max = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		if (!node_memory[i].exists)
			continue;
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (!node_memory[j].exists)
				continue;
			t = lat_stats->latencies[i][j];
			if (t > lat_stats->latency_max)
				lat_stats->latency_max = t;
		}
	}
}


/*
 * Verify following about latencies between nodes:
 *
 * - Latencies should be symmetric (ie.
latency(a, b) == latency(b, a)) 1555 * - Local latencies same 1556 * - Local < remote 1557 * - Number of latencies seen is reasonable 1558 * - Number of occurrences of a given latency should be more than 1 1559 * 1560 * Returns: 1561 * 0 Success 1562 * -1 Not symmetric 1563 * -2 Local latencies not same 1564 * -3 Local >= remote 1565 */ 1566 static int 1567 lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 1568 lgrp_plat_latency_stats_t *lat_stats) 1569 { 1570 int i; 1571 int j; 1572 u_longlong_t t1; 1573 u_longlong_t t2; 1574 1575 ASSERT(node_memory != NULL && lat_stats != NULL); 1576 1577 /* 1578 * Nothing to do when this is an UMA machine, lgroup topology is 1579 * limited to 2 levels, or there aren't any probe times yet 1580 */ 1581 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1582 lat_stats->latencies[0][0] == 0) 1583 return (0); 1584 1585 /* 1586 * Make sure that latencies are symmetric between any two nodes 1587 * (ie. latency(node0, node1) == latency(node1, node0)) 1588 */ 1589 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1590 if (!node_memory[i].exists) 1591 continue; 1592 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1593 if (!node_memory[j].exists) 1594 continue; 1595 t1 = lat_stats->latencies[i][j]; 1596 t2 = lat_stats->latencies[j][i]; 1597 1598 if (t1 == 0 || t2 == 0 || t1 == t2) 1599 continue; 1600 1601 return (-1); 1602 } 1603 } 1604 1605 /* 1606 * Local latencies should be same 1607 */ 1608 t1 = lat_stats->latencies[0][0]; 1609 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1610 if (!node_memory[i].exists) 1611 continue; 1612 1613 t2 = lat_stats->latencies[i][i]; 1614 if (t2 == 0) 1615 continue; 1616 1617 if (t1 == 0) { 1618 t1 = t2; 1619 continue; 1620 } 1621 1622 if (t1 != t2) 1623 return (-2); 1624 } 1625 1626 /* 1627 * Local latencies should be less than remote 1628 */ 1629 if (t1) { 1630 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1631 if (!node_memory[i].exists) 1632 continue; 1633 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1634 if 
(!node_memory[j].exists) 1635 continue; 1636 t2 = lat_stats->latencies[i][j]; 1637 if (i == j || t2 == 0) 1638 continue; 1639 1640 if (t1 >= t2) 1641 return (-3); 1642 } 1643 } 1644 } 1645 1646 return (0); 1647 } 1648 1649 1650 /* 1651 * Return the number of free, allocatable, or installed 1652 * pages in an lgroup 1653 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 1654 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 1655 */ 1656 /* ARGSUSED */ 1657 static pgcnt_t 1658 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 1659 { 1660 struct memlist *mlist; 1661 pgcnt_t npgs = 0; 1662 extern struct memlist *phys_avail; 1663 extern struct memlist *phys_install; 1664 1665 switch (query) { 1666 case LGRP_MEM_SIZE_FREE: 1667 return ((pgcnt_t)freemem); 1668 case LGRP_MEM_SIZE_AVAIL: 1669 memlist_read_lock(); 1670 for (mlist = phys_avail; mlist; mlist = mlist->next) 1671 npgs += btop(mlist->size); 1672 memlist_read_unlock(); 1673 return (npgs); 1674 case LGRP_MEM_SIZE_INSTALL: 1675 memlist_read_lock(); 1676 for (mlist = phys_install; mlist; mlist = mlist->next) 1677 npgs += btop(mlist->size); 1678 memlist_read_unlock(); 1679 return (npgs); 1680 default: 1681 return ((pgcnt_t)0); 1682 } 1683 } 1684 1685 1686 /* 1687 * Update node to proximity domain mappings for given domain and return node ID 1688 */ 1689 static int 1690 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, uint32_t domain) 1691 { 1692 uint_t node; 1693 uint_t start; 1694 1695 /* 1696 * Hash proximity domain ID into node to domain mapping table (array) 1697 * and add entry for it into first non-existent or matching entry found 1698 */ 1699 node = start = NODE_DOMAIN_HASH(domain); 1700 do { 1701 /* 1702 * Entry doesn't exist yet, so create one for this proximity 1703 * domain and return node ID which is index into mapping table. 
1704 */ 1705 if (!node_domain[node].exists) { 1706 node_domain[node].exists = 1; 1707 node_domain[node].prox_domain = domain; 1708 return (node); 1709 } 1710 1711 /* 1712 * Entry exists for this proximity domain already, so just 1713 * return node ID (index into table). 1714 */ 1715 if (node_domain[node].prox_domain == domain) 1716 return (node); 1717 node = NODE_DOMAIN_HASH(node + 1); 1718 } while (node != start); 1719 1720 /* 1721 * Ran out of supported number of entries which shouldn't happen.... 1722 */ 1723 ASSERT(node != start); 1724 return (-1); 1725 } 1726 1727 1728 /* 1729 * Update node memory information for given proximity domain with specified 1730 * starting and ending physical address range (and return positive numbers for 1731 * success and negative ones for errors) 1732 */ 1733 static int 1734 lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 1735 node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end, 1736 uint32_t domain) 1737 { 1738 int node; 1739 1740 /* 1741 * Get node number for proximity domain 1742 */ 1743 node = lgrp_plat_domain_to_node(node_domain, domain); 1744 if (node == -1) { 1745 node = lgrp_plat_node_domain_update(node_domain, domain); 1746 if (node == -1) 1747 return (-1); 1748 } 1749 1750 /* 1751 * Create entry in table for node if it doesn't exist 1752 */ 1753 if (!node_memory[node].exists) { 1754 node_memory[node].exists = 1; 1755 node_memory[node].start = btop(start); 1756 node_memory[node].end = btop(end); 1757 node_memory[node].prox_domain = domain; 1758 return (0); 1759 } 1760 1761 /* 1762 * Entry already exists for this proximity domain 1763 * 1764 * There may be more than one SRAT memory entry for a domain, so we may 1765 * need to update existing start or end address for the node. 
1766 */ 1767 if (node_memory[node].prox_domain == domain) { 1768 if (btop(start) < node_memory[node].start) 1769 node_memory[node].start = btop(start); 1770 if (btop(end) > node_memory[node].end) 1771 node_memory[node].end = btop(end); 1772 return (1); 1773 } 1774 return (-2); 1775 } 1776 1777 1778 /* 1779 * Return time needed to probe from current CPU to memory in given node 1780 */ 1781 static hrtime_t 1782 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 1783 lgrp_plat_probe_mem_config_t *probe_mem_config, 1784 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1785 { 1786 caddr_t buf; 1787 hrtime_t elapsed; 1788 hrtime_t end; 1789 int from; 1790 int i; 1791 int ipl; 1792 hrtime_t max; 1793 hrtime_t min; 1794 hrtime_t start; 1795 extern int use_sse_pagecopy; 1796 1797 /* 1798 * Determine ID of node containing current CPU 1799 */ 1800 from = lgrp_plat_cpu_to_node(CPU, cpu_node); 1801 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1802 1803 /* 1804 * Do common work for probing main memory 1805 */ 1806 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 1807 /* 1808 * Skip probing any nodes without memory and 1809 * set probe time to 0 1810 */ 1811 if (probe_mem_config->probe_va[to] == NULL) { 1812 lat_stats->latencies[from][to] = 0; 1813 return (0); 1814 } 1815 1816 /* 1817 * Invalidate caches once instead of once every sample 1818 * which should cut cost of probing by a lot 1819 */ 1820 probe_stats->flush_cost = gethrtime(); 1821 invalidate_cache(); 1822 probe_stats->flush_cost = gethrtime() - 1823 probe_stats->flush_cost; 1824 probe_stats->probe_cost_total += probe_stats->flush_cost; 1825 } 1826 1827 /* 1828 * Probe from current CPU to given memory using specified operation 1829 * and take specified number of samples 1830 */ 1831 max = 0; 1832 min = -1; 1833 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 1834 probe_stats->probe_cost = gethrtime(); 1835 1836 /* 1837 * Can't measure probe time if gethrtime() isn't working yet 
1838 */ 1839 if (probe_stats->probe_cost == 0 && gethrtime() == 0) 1840 return (0); 1841 1842 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 1843 /* 1844 * Measure how long it takes to read vendor ID from 1845 * Northbridge 1846 */ 1847 elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads); 1848 } else { 1849 /* 1850 * Measure how long it takes to copy page 1851 * on top of itself 1852 */ 1853 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 1854 1855 kpreempt_disable(); 1856 ipl = splhigh(); 1857 start = gethrtime(); 1858 if (use_sse_pagecopy) 1859 hwblkpagecopy(buf, buf); 1860 else 1861 bcopy(buf, buf, PAGESIZE); 1862 end = gethrtime(); 1863 elapsed = end - start; 1864 splx(ipl); 1865 kpreempt_enable(); 1866 } 1867 1868 probe_stats->probe_cost = gethrtime() - 1869 probe_stats->probe_cost; 1870 probe_stats->probe_cost_total += probe_stats->probe_cost; 1871 1872 if (min == -1 || elapsed < min) 1873 min = elapsed; 1874 if (elapsed > max) 1875 max = elapsed; 1876 } 1877 1878 /* 1879 * Update minimum and maximum probe times between 1880 * these two nodes 1881 */ 1882 if (min < probe_stats->probe_min[from][to] || 1883 probe_stats->probe_min[from][to] == 0) 1884 probe_stats->probe_min[from][to] = min; 1885 1886 if (max > probe_stats->probe_max[from][to]) 1887 probe_stats->probe_max[from][to] = max; 1888 1889 return (min); 1890 } 1891 1892 1893 /* 1894 * Read boot property with CPU to APIC ID array and fill in CPU to node ID 1895 * mapping table with APIC ID for each CPU 1896 * 1897 * NOTE: This code assumes that CPU IDs are assigned in order that they appear 1898 * in in cpu_apicid_array boot property which is based on and follows 1899 * same ordering as processor list in ACPI MADT. If the code in 1900 * usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns 1901 * CPU IDs ever changes, then this code will need to change too.... 
1902 */ 1903 static int 1904 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node, int boot_ncpus) 1905 { 1906 char *boot_prop_name = BP_CPU_APICID_ARRAY; 1907 uint8_t cpu_apicid_array[UINT8_MAX + 1]; 1908 int i; 1909 int boot_prop_len; 1910 1911 /* 1912 * Nothing to do when no array to fill in or not enough CPUs 1913 */ 1914 if (cpu_node == NULL || boot_ncpus <= 1) 1915 return (1); 1916 1917 /* 1918 * Check length of property value 1919 */ 1920 boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name); 1921 if (boot_prop_len <= 0 || boot_prop_len > UINT8_MAX) 1922 return (2); 1923 1924 /* 1925 * Get CPU to APIC ID property value 1926 */ 1927 if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0) 1928 return (3); 1929 1930 /* 1931 * Fill in CPU to node ID mapping table with APIC ID for each CPU 1932 */ 1933 for (i = 0; i < boot_ncpus; i++) { 1934 cpu_node[i].exists = 1; 1935 cpu_node[i].apicid = cpu_apicid_array[i]; 1936 } 1937 1938 return (0); 1939 } 1940 1941 1942 /* 1943 * Read ACPI System Locality Information Table (SLIT) to determine how far each 1944 * NUMA node is from each other 1945 */ 1946 static int 1947 lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 1948 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats) 1949 { 1950 int i; 1951 int j; 1952 int localities; 1953 hrtime_t max; 1954 hrtime_t min; 1955 int retval; 1956 uint8_t *slit_entries; 1957 1958 if (tp == NULL || !lgrp_plat_slit_enable) 1959 return (1); 1960 1961 if (lat_stats == NULL) 1962 return (2); 1963 1964 localities = tp->number; 1965 if (localities != node_cnt) 1966 return (3); 1967 1968 min = lat_stats->latency_min; 1969 max = lat_stats->latency_max; 1970 1971 /* 1972 * Fill in latency matrix based on SLIT entries 1973 */ 1974 slit_entries = tp->entry; 1975 for (i = 0; i < localities; i++) { 1976 for (j = 0; j < localities; j++) { 1977 uint8_t latency; 1978 1979 latency = slit_entries[(i * localities) + j]; 1980 lat_stats->latencies[i][j] = latency; 
1981 if (latency < min || min == -1) 1982 min = latency; 1983 if (latency > max) 1984 max = latency; 1985 } 1986 } 1987 1988 /* 1989 * Verify that latencies/distances given in SLIT look reasonable 1990 */ 1991 retval = lgrp_plat_latency_verify(node_memory, lat_stats); 1992 1993 if (retval) { 1994 /* 1995 * Reinitialize (zero) latency table since SLIT doesn't look 1996 * right 1997 */ 1998 for (i = 0; i < localities; i++) { 1999 for (j = 0; j < localities; j++) 2000 lat_stats->latencies[i][j] = 0; 2001 } 2002 } else { 2003 /* 2004 * Update min and max latencies seen since SLIT looks valid 2005 */ 2006 lat_stats->latency_min = min; 2007 lat_stats->latency_max = max; 2008 } 2009 2010 return (retval); 2011 } 2012 2013 2014 /* 2015 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 2016 * and memory are local to each other in the same NUMA node 2017 */ 2018 static int 2019 lgrp_plat_process_srat(struct srat *tp, int cpu_count, uint_t *node_cnt, 2020 node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, 2021 node_phys_addr_map_t *node_memory) 2022 { 2023 struct srat_item *srat_end; 2024 int i; 2025 struct srat_item *item; 2026 int proc_entry_count; 2027 2028 if (tp == NULL || !lgrp_plat_srat_enable) 2029 return (1); 2030 2031 /* 2032 * Determine number of nodes by counting number of proximity domains in 2033 * SRAT 2034 */ 2035 if (node_cnt) { 2036 int nodes; 2037 2038 nodes = lgrp_plat_srat_domains(tp); 2039 if (nodes < 0) { 2040 *node_cnt = 1; 2041 return (2); 2042 } 2043 *node_cnt = nodes; 2044 } 2045 2046 /* 2047 * Walk through SRAT, examining each CPU and memory entry to determine 2048 * which CPUs and memory belong to which node. 
2049 */ 2050 item = tp->list; 2051 srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2052 proc_entry_count = 0; 2053 while (item < srat_end) { 2054 uint32_t apic_id; 2055 uint32_t domain; 2056 uint64_t end; 2057 uint64_t length; 2058 uint64_t start; 2059 2060 switch (item->type) { 2061 case SRAT_PROCESSOR: /* CPU entry */ 2062 if (!(item->i.p.flags & SRAT_ENABLED) || 2063 cpu_node == NULL) 2064 break; 2065 2066 /* 2067 * Calculate domain (node) ID and fill in APIC ID to 2068 * domain/node mapping table 2069 */ 2070 domain = item->i.p.domain1; 2071 for (i = 0; i < 3; i++) { 2072 domain += item->i.p.domain2[i] << 2073 ((i + 1) * 8); 2074 } 2075 apic_id = item->i.p.apic_id; 2076 2077 if (lgrp_plat_cpu_node_update(node_domain, cpu_node, 2078 cpu_count, apic_id, domain) < 0) 2079 return (3); 2080 2081 proc_entry_count++; 2082 break; 2083 2084 case SRAT_MEMORY: /* memory entry */ 2085 if (!(item->i.m.flags & SRAT_ENABLED) || 2086 node_memory == NULL) 2087 break; 2088 2089 /* 2090 * Get domain (node) ID and fill in domain/node 2091 * to memory mapping table 2092 */ 2093 domain = item->i.m.domain; 2094 start = item->i.m.base_addr; 2095 length = item->i.m.len; 2096 end = start + length - 1; 2097 2098 if (lgrp_plat_node_memory_update(node_domain, 2099 node_memory, start, end, domain) < 0) 2100 return (4); 2101 break; 2102 2103 default: 2104 break; 2105 } 2106 2107 item = (struct srat_item *)((uintptr_t)item + item->len); 2108 } 2109 2110 /* 2111 * Should have seen at least as many SRAT processor entries as CPUs 2112 */ 2113 if (proc_entry_count >= cpu_count) 2114 return (5); 2115 2116 return (0); 2117 } 2118 2119 2120 /* 2121 * Return number of proximity domains given in ACPI SRAT 2122 */ 2123 static int 2124 lgrp_plat_srat_domains(struct srat *tp) 2125 { 2126 int domain_cnt; 2127 struct srat_item *end; 2128 int i; 2129 struct srat_item *item; 2130 node_domain_map_t node_domain[MAX_NODES]; 2131 2132 2133 if (tp == NULL || !lgrp_plat_srat_enable) 2134 return (1); 
2135 2136 /* 2137 * Walk through SRAT, examining each CPU and memory entry to determine 2138 * proximity domain ID for each. 2139 */ 2140 domain_cnt = 0; 2141 item = tp->list; 2142 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2143 bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t)); 2144 while (item < end) { 2145 uint32_t domain; 2146 boolean_t overflow; 2147 uint_t start; 2148 2149 switch (item->type) { 2150 case SRAT_PROCESSOR: /* CPU entry */ 2151 if (!(item->i.p.flags & SRAT_ENABLED)) 2152 break; 2153 domain = item->i.p.domain1; 2154 for (i = 0; i < 3; i++) { 2155 domain += item->i.p.domain2[i] << 2156 ((i + 1) * 8); 2157 } 2158 break; 2159 2160 case SRAT_MEMORY: /* memory entry */ 2161 if (!(item->i.m.flags & SRAT_ENABLED)) 2162 break; 2163 domain = item->i.m.domain; 2164 break; 2165 2166 default: 2167 break; 2168 } 2169 2170 /* 2171 * Count and keep track of which proximity domain IDs seen 2172 */ 2173 start = i = domain % MAX_NODES; 2174 overflow = B_TRUE; 2175 do { 2176 /* 2177 * Create entry for proximity domain and increment 2178 * count when no entry exists where proximity domain 2179 * hashed 2180 */ 2181 if (!node_domain[i].exists) { 2182 node_domain[i].exists = 1; 2183 node_domain[i].prox_domain = domain; 2184 domain_cnt++; 2185 overflow = B_FALSE; 2186 break; 2187 } 2188 2189 /* 2190 * Nothing to do when proximity domain seen already 2191 * and its entry exists 2192 */ 2193 if (node_domain[i].prox_domain == domain) { 2194 overflow = B_FALSE; 2195 break; 2196 } 2197 2198 /* 2199 * Entry exists where proximity domain hashed, but for 2200 * different proximity domain so keep search for empty 2201 * slot to put it or matching entry whichever comes 2202 * first. 
2203 */ 2204 i = (i + 1) % MAX_NODES; 2205 } while (i != start); 2206 2207 /* 2208 * Didn't find empty or matching entry which means have more 2209 * proximity domains than supported nodes (:-( 2210 */ 2211 ASSERT(overflow != B_TRUE); 2212 if (overflow == B_TRUE) 2213 return (-1); 2214 2215 item = (struct srat_item *)((uintptr_t)item + item->len); 2216 } 2217 return (domain_cnt); 2218 } 2219 2220 2221 /* 2222 * Set lgroup latencies for 2 level lgroup topology 2223 */ 2224 static void 2225 lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 2226 lgrp_plat_latency_stats_t *lat_stats) 2227 { 2228 int i; 2229 2230 ASSERT(node_memory != NULL && lat_stats != NULL); 2231 2232 if (lgrp_plat_node_cnt >= 4) 2233 cmn_err(CE_NOTE, 2234 "MPO only optimizing for local and remote\n"); 2235 for (i = 0; i < lgrp_plat_node_cnt; i++) { 2236 int j; 2237 2238 if (!node_memory[i].exists) 2239 continue; 2240 for (j = 0; j < lgrp_plat_node_cnt; j++) { 2241 if (!node_memory[j].exists) 2242 continue; 2243 if (i == j) 2244 lat_stats->latencies[i][j] = 2; 2245 else 2246 lat_stats->latencies[i][j] = 3; 2247 } 2248 } 2249 lat_stats->latency_min = 2; 2250 lat_stats->latency_max = 3; 2251 lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 2252 } 2253 2254 2255 /* 2256 * The following Opteron specific constants, macros, types, and routines define 2257 * PCI configuration space registers and how to read them to determine the NUMA 2258 * configuration of *supported* Opteron processors. They provide the same 2259 * information that may be gotten from the ACPI System Resource Affinity Table 2260 * (SRAT) if it exists on the machine of interest. 2261 * 2262 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family 2263 * of interest describes all of these registers and their contents. 
The main 2264 * registers used by this code to determine the NUMA configuration of the 2265 * machine are the node ID register for the number of NUMA nodes and the DRAM 2266 * address map registers for the physical address range of each node. 2267 * 2268 * NOTE: The format and how to determine the NUMA configuration using PCI 2269 * config space registers may change or may not be supported in future 2270 * Opteron processor families. 2271 */ 2272 2273 /* 2274 * How many bits to shift Opteron DRAM Address Map base and limit registers 2275 * to get actual value 2276 */ 2277 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 2278 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 2279 2280 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 2281 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 2282 2283 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 2284 2285 /* 2286 * Macros to derive addresses from Opteron DRAM Address Map registers 2287 */ 2288 #define OPT_DRAMADDR_HI(reg) \ 2289 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 2290 OPT_DRAMADDR_HI_LSHIFT_ADDR) 2291 2292 #define OPT_DRAMADDR_LO(reg) \ 2293 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 2294 OPT_DRAMADDR_LO_LSHIFT_ADDR) 2295 2296 #define OPT_DRAMADDR(high, low) \ 2297 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 2298 2299 /* 2300 * Bit masks defining what's in Opteron DRAM Address Map base register 2301 */ 2302 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 2303 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 2304 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 2305 2306 /* 2307 * Bit masks defining what's in Opteron DRAM Address Map limit register 2308 */ 2309 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 2310 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 2311 2312 2313 /* 2314 * Opteron Node ID register in PCI configuration 
space contains 2315 * number of nodes in system, etc. for Opteron K8. The following 2316 * constants and macros define its contents, structure, and access. 2317 */ 2318 2319 /* 2320 * Bit masks defining what's in Opteron Node ID register 2321 */ 2322 #define OPT_NODE_MASK_ID 0x7 /* node ID */ 2323 #define OPT_NODE_MASK_CNT 0x70 /* node count */ 2324 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 2325 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 2326 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 2327 2328 /* 2329 * How many bits in Opteron Node ID register to shift right to get actual value 2330 */ 2331 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 2332 2333 /* 2334 * Macros to get values from Opteron Node ID register 2335 */ 2336 #define OPT_NODE_CNT(reg) \ 2337 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 2338 2339 /* 2340 * Macro to setup PCI Extended Configuration Space (ECS) address to give to 2341 * "in/out" instructions 2342 * 2343 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 2344 * other uses should just do MMIO to access PCI ECS. 2345 * Must enable special bit in Northbridge Configuration Register on 2346 * Greyhound for extended CF8 space access to be able to access PCI ECS 2347 * using "in/out" instructions and restore special bit after done 2348 * accessing PCI ECS. 2349 */ 2350 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 2351 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 2352 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 2353 ((((reg) >> 8) & 0xf) << 24)) 2354 2355 /* 2356 * PCI configuration space registers accessed by specifying 2357 * a bus, device, function, and offset. 
The following constants 2358 * define the values needed to access Opteron K8 configuration 2359 * info to determine its node topology 2360 */ 2361 2362 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 2363 2364 /* 2365 * Opteron PCI configuration space register function values 2366 */ 2367 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 2368 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 2369 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 2370 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 2371 2372 /* 2373 * PCI Configuration Space register offsets 2374 */ 2375 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 2376 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 2377 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 2378 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 2379 2380 /* 2381 * Opteron PCI Configuration Space device IDs for nodes 2382 */ 2383 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 2384 2385 2386 /* 2387 * Opteron DRAM address map gives base and limit for physical memory in a node 2388 */ 2389 typedef struct opt_dram_addr_map { 2390 uint32_t base_hi; 2391 uint32_t base_lo; 2392 uint32_t limit_hi; 2393 uint32_t limit_lo; 2394 } opt_dram_addr_map_t; 2395 2396 2397 /* 2398 * Supported AMD processor families 2399 */ 2400 #define AMD_FAMILY_HAMMER 15 2401 #define AMD_FAMILY_GREYHOUND 16 2402 2403 /* 2404 * Whether to have is_opteron() return 1 even when processor isn't supported 2405 */ 2406 uint_t is_opteron_override = 0; 2407 2408 /* 2409 * AMD processor family for current CPU 2410 */ 2411 uint_t opt_family = 0; 2412 2413 2414 /* 2415 * Determine whether we're running on a supported AMD Opteron since reading 2416 * node count and DRAM address map registers may have different format or 2417 * may not be supported across processor families 2418 */ 2419 static int 2420 is_opteron(void) 2421 { 2422 2423 if 
(x86_vendor != X86_VENDOR_AMD) 2424 return (0); 2425 2426 opt_family = cpuid_getfamily(CPU); 2427 if (opt_family == AMD_FAMILY_HAMMER || 2428 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 2429 return (1); 2430 else 2431 return (0); 2432 } 2433 2434 2435 /* 2436 * Determine NUMA configuration for Opteron from registers that live in PCI 2437 * configuration space 2438 */ 2439 static void 2440 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 2441 node_phys_addr_map_t *node_memory) 2442 { 2443 uint_t bus; 2444 uint_t dev; 2445 struct opt_dram_addr_map dram_map[MAX_NODES]; 2446 uint_t node; 2447 uint_t node_info[MAX_NODES]; 2448 uint_t off_hi; 2449 uint_t off_lo; 2450 uint64_t nb_cfg_reg; 2451 2452 /* 2453 * Read configuration registers from PCI configuration space to 2454 * determine node information, which memory is in each node, etc. 2455 * 2456 * Write to PCI configuration space address register to specify 2457 * which configuration register to read and read/write PCI 2458 * configuration space data register to get/set contents 2459 */ 2460 bus = OPT_PCS_BUS_CONFIG; 2461 dev = OPT_PCS_DEV_NODE0; 2462 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 2463 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 2464 2465 /* 2466 * Read node ID register for node 0 to get node count 2467 */ 2468 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 2469 OPT_PCS_OFF_NODEID); 2470 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 2471 2472 /* 2473 * If number of nodes is more than maximum supported, then set node 2474 * count to 1 and treat system as UMA instead of NUMA. 
2475 */ 2476 if (*node_cnt > MAX_NODES) { 2477 *node_cnt = 1; 2478 return; 2479 } 2480 2481 /* 2482 * For Greyhound, PCI Extended Configuration Space must be enabled to 2483 * read high DRAM address map base and limit registers 2484 */ 2485 if (opt_family == AMD_FAMILY_GREYHOUND) { 2486 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 2487 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2488 wrmsr(MSR_AMD_NB_CFG, 2489 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 2490 } 2491 2492 for (node = 0; node < *node_cnt; node++) { 2493 uint32_t base_hi; 2494 uint32_t base_lo; 2495 uint32_t limit_hi; 2496 uint32_t limit_lo; 2497 2498 /* 2499 * Read node ID register (except for node 0 which we just read) 2500 */ 2501 if (node > 0) { 2502 node_info[node] = pci_getl_func(bus, dev, 2503 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 2504 } 2505 2506 /* 2507 * Read DRAM base and limit registers which specify 2508 * physical memory range of each node 2509 */ 2510 if (opt_family != AMD_FAMILY_GREYHOUND) 2511 base_hi = 0; 2512 else { 2513 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2514 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2515 base_hi = dram_map[node].base_hi = 2516 inl(PCI_CONFDATA); 2517 } 2518 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 2519 OPT_PCS_FUNC_ADDRMAP, off_lo); 2520 2521 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 2522 mem_intrlv) 2523 *mem_intrlv = *mem_intrlv + 1; 2524 2525 off_hi += 4; /* high limit register offset */ 2526 if (opt_family != AMD_FAMILY_GREYHOUND) 2527 limit_hi = 0; 2528 else { 2529 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2530 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2531 limit_hi = dram_map[node].limit_hi = 2532 inl(PCI_CONFDATA); 2533 } 2534 2535 off_lo += 4; /* low limit register offset */ 2536 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 2537 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 2538 2539 /* 2540 * Increment device number to next node and register offsets 2541 * for DRAM base register of next node 2542 */ 2543 off_hi += 4; 2544 off_lo += 4; 
2545 dev++; 2546 2547 /* 2548 * Both read and write enable bits must be enabled in DRAM 2549 * address map base register for physical memory to exist in 2550 * node 2551 */ 2552 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 2553 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 2554 /* 2555 * Mark node memory as non-existent and set start and 2556 * end addresses to be same in node_memory[] 2557 */ 2558 node_memory[node].exists = 0; 2559 node_memory[node].start = node_memory[node].end = 2560 (pfn_t)-1; 2561 continue; 2562 } 2563 2564 /* 2565 * Mark node memory as existing and remember physical address 2566 * range of each node for use later 2567 */ 2568 node_memory[node].exists = 1; 2569 2570 node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 2571 2572 node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 2573 OPT_DRAMADDR_LO_MASK_OFF); 2574 } 2575 2576 /* 2577 * Restore PCI Extended Configuration Space enable bit 2578 */ 2579 if (opt_family == AMD_FAMILY_GREYHOUND) { 2580 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2581 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 2582 } 2583 } 2584 2585 2586 /* 2587 * Return average amount of time to read vendor ID register on Northbridge 2588 * N times on specified destination node from current CPU 2589 */ 2590 static hrtime_t 2591 opt_probe_vendor(int dest_node, int nreads) 2592 { 2593 int cnt; 2594 uint_t dev; 2595 /* LINTED: set but not used in function */ 2596 volatile uint_t dev_vendor; 2597 hrtime_t elapsed; 2598 hrtime_t end; 2599 int ipl; 2600 hrtime_t start; 2601 2602 dev = OPT_PCS_DEV_NODE0 + dest_node; 2603 kpreempt_disable(); 2604 ipl = spl8(); 2605 outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM, 2606 OPT_PCS_OFF_VENDOR)); 2607 start = gethrtime(); 2608 for (cnt = 0; cnt < nreads; cnt++) 2609 dev_vendor = inl(PCI_CONFDATA); 2610 end = gethrtime(); 2611 elapsed = (end - start) / nreads; 2612 splx(ipl); 2613 kpreempt_enable(); 2614 return (elapsed); 2615 } 2616