/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"


/*
 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
 * ================================================================
 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
 * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
 * one or more CPUs and some local memory.  The CPUs in each node can access
 * the memory in the other nodes but at a higher latency than accessing their
 * local memory.  Typically, a system with only one node has Uniform Memory
 * Access (UMA), but it may be possible to have a one node system that has
 * some global memory outside of the node which is higher latency.
 *
 * Module Description
 * ------------------
 * This module provides a platform interface for determining which CPUs and
 * which memory (and how much) are in a NUMA node and how far each node is from
 * each other.  The interface is used by the Virtual Memory (VM) system and the
 * common lgroup framework.  The VM system uses the plat_*() routines to fill
 * in its memory node (memnode) array with the physical address range spanned
 * by each NUMA node to know which memory belongs to which node, so it can
 * build and manage a physical page free list for each NUMA node and allocate
 * local memory from each node as needed.  The common lgroup framework uses the
 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
 * to each node (leaf lgroup) and how far each node is from each other, so it
 * can build the latency (lgroup) topology for the machine in order to optimize
 * for locality.  Also, an lgroup platform handle instead of lgroups is used
 * in the interface with this module, so this module shouldn't need to know
 * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
 * etc. are in each NUMA node, how far each node is from each other, and to use
 * a unique lgroup platform handle to refer to each node through the interface.
 *
 * Determining NUMA Configuration
 * ------------------------------
 * By default, this module will try to determine the NUMA configuration of the
 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
 * Locality Information Table (SLIT).  The SRAT contains info to tell which
 * CPUs and memory are local to a given proximity domain (NUMA node).
 * The SLIT is a matrix that gives the distance between each system locality
 * (which is a NUMA node and should correspond to proximity domains in the
 * SRAT).  For more details on the SRAT and SLIT, please refer to an ACPI 3.0
 * or newer specification.
 *
 * If the SRAT doesn't exist on a system with AMD Opteron processors, we
 * examine registers in PCI configuration space to determine how many nodes are
 * in the system and which CPUs and memory are in each node.  This is done
 * while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 * locality info is not guaranteed to work or be compatible across all
 * Opteron processor families.
 *
 * If the SLIT does not exist or look right, the kernel will probe to determine
 * the distance between nodes as long as the NUMA CPU and memory configuration
 * has been determined (see lgrp_plat_probe() for details).
 *
 * Data Structures
 * ---------------
 * The main data structures used by this code are the following:
 *
 * - lgrp_plat_cpu_node[]		APIC ID to node ID mapping table
 *					indexed by hashed APIC ID (only used
 *					for SRAT)
 *
 * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
 *					different nodes indexed by node ID
 *
 * - lgrp_plat_node_cnt			Number of NUMA nodes in system
 *
 * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
 *					table indexed by node ID (only used
 *					for SRAT)
 *
 * - lgrp_plat_node_memory[]		Table with physical address range for
 *					each node indexed by node ID
 *
 * The code is implemented to make the following always be true:
 *
 *	lgroup platform handle == node ID == memnode ID
 *
 * Moreover, it allows for the proximity domain ID to be equal to all of the
 * above as long as the proximity domain IDs are numbered from 0 to <number of
 * nodes - 1>.  This is done by hashing each proximity domain ID into the range
 * from 0 to <number of nodes - 1>.  Then proximity domain ID N will hash into
 * node ID N, will be entered into lgrp_plat_node_domain[N], and will be
 * assigned node ID N.  If the proximity domain IDs aren't numbered
 * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
 * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
 * to node IDs.  However, the proximity domain IDs may not map to the
 * equivalent node ID since we want to keep the node IDs numbered from 0 to
 * <number of nodes - 1> to minimize cost of searching and potentially space.
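 *
 * As a hypothetical example of the hashing described above (the proximity
 * domain IDs here are made up, not taken from any particular machine), a
 * 4 node system whose SRAT reports proximity domains 0x10, 0x11, 0x12, and
 * 0x13 would hash them with NODE_DOMAIN_HASH() (ie. modulo
 * lgrp_plat_node_cnt) as follows:
 *
 *	0x10 % 4 == 0, 0x11 % 4 == 1, 0x12 % 4 == 2, 0x13 % 4 == 3
 *
 * so each proximity domain still gets a unique node ID in
 * lgrp_plat_node_domain[] even though the domain IDs themselves aren't
 * numbered from 0 to 3.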
 */


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/controlregs.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>

#include "acpi_fw.h"		/* for SRAT and SLIT */


#define	MAX_NODES		8
#define	NLGRP	(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */

/*
 * Hash CPU APIC ID into CPU to node mapping table using max_ncpus
 * to minimize span of entries used
 */
#define	CPU_NODE_HASH(apicid)		((apicid) % max_ncpus)

/*
 * Hash proximity domain ID into node to domain mapping table using
 * lgrp_plat_node_cnt to minimize span of entries used
 */
#define	NODE_DOMAIN_HASH(domain)	((domain) % lgrp_plat_node_cnt)


/*
 * CPU APIC ID to node ID mapping structure (only used with SRAT)
 */
typedef	struct cpu_node_map {
	int		exists;
	uint_t		node;
	uint32_t	apicid;
	uint32_t	prox_domain;
} cpu_node_map_t;

/*
 * Latency statistics
 */
typedef struct lgrp_plat_latency_stats {
	hrtime_t	latencies[MAX_NODES][MAX_NODES];
	hrtime_t	latency_max;
	hrtime_t	latency_min;
} lgrp_plat_latency_stats_t;

/*
 * Memory configuration for probing
 */
typedef struct lgrp_plat_probe_mem_config {
	size_t	probe_memsize;		/* how much memory to probe per node */
	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
} lgrp_plat_probe_mem_config_t;

/*
 * Statistics kept for probing
 */
typedef struct lgrp_plat_probe_stats {
	hrtime_t	flush_cost;
	hrtime_t	probe_cost;
	hrtime_t	probe_cost_total;
	hrtime_t	probe_error_code;
	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
	int		probe_suspect[MAX_NODES][MAX_NODES];
	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
} lgrp_plat_probe_stats_t;

/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 */
typedef	struct node_domain_map {
	int		exists;
	uint32_t	prox_domain;
} node_domain_map_t;

/*
 * Node ID and starting and ending page for physical memory in node
 */
typedef	struct node_phys_addr_map {
	pfn_t		start;
	pfn_t		end;
	int		exists;
	uint32_t	prox_domain;
} node_phys_addr_map_t;


/*
 * CPU APIC ID to node ID mapping table (only used for SRAT)
 */
static cpu_node_map_t lgrp_plat_cpu_node[NCPU]; 245 246 /* 247 * Latency statistics 248 */ 249 lgrp_plat_latency_stats_t lgrp_plat_lat_stats; 250 251 /* 252 * Whether memory is interleaved across nodes causing MPO to be disabled 253 */ 254 static int lgrp_plat_mem_intrlv = 0; 255 256 /* 257 * Node ID to proximity domain ID mapping table (only used for SRAT) 258 */ 259 static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; 260 261 /* 262 * Physical address range for memory in each node 263 */ 264 static node_phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 265 266 /* 267 * Statistics gotten from probing 268 */ 269 static lgrp_plat_probe_stats_t lgrp_plat_probe_stats; 270 271 /* 272 * Memory configuration for probing 273 */ 274 static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config; 275 276 /* 277 * Error code from processing ACPI SRAT 278 */ 279 static int lgrp_plat_srat_error = 0; 280 281 /* 282 * Error code from processing ACPI SLIT 283 */ 284 static int lgrp_plat_slit_error = 0; 285 286 /* 287 * Allocate lgroup array statically 288 */ 289 static lgrp_t lgrp_space[NLGRP]; 290 static int nlgrps_alloc; 291 292 293 /* 294 * Number of nodes in system 295 */ 296 uint_t lgrp_plat_node_cnt = 1; 297 298 /* 299 * Configuration Parameters for Probing 300 * - lgrp_plat_probe_flags Flags to specify enabling probing, probe 301 * operation, etc. 302 * - lgrp_plat_probe_nrounds How many rounds of probing to do 303 * - lgrp_plat_probe_nsamples Number of samples to take when probing each 304 * node 305 * - lgrp_plat_probe_nreads Number of times to read vendor ID from 306 * Northbridge for each probe 307 */ 308 uint_t lgrp_plat_probe_flags = 0; 309 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 310 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 311 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 312 313 /* 314 * Enable use of ACPI System Resource Affinity Table (SRAT) and System 315 * Locality Information Table (SLIT) 316 */ 317 int lgrp_plat_srat_enable = 1; 318 int lgrp_plat_slit_enable = 1; 319 320 /* 321 * Static array to hold lgroup statistics 322 */ 323 struct lgrp_stats lgrp_stats[NLGRP]; 324 325 326 /* 327 * Forward declarations of platform interface routines 328 */ 329 void plat_build_mem_nodes(struct memlist *list); 330 331 int plat_lgrphand_to_mem_node(lgrp_handle_t hand); 332 333 lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); 334 335 int plat_mnode_xcheck(pfn_t pfncnt); 336 337 int plat_pfn_to_mem_node(pfn_t pfn); 338 339 /* 340 * Forward declarations of lgroup platform interface routines 341 */ 342 lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); 343 344 void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg); 345 346 lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); 347 348 void lgrp_plat_init(void); 349 350 int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); 351 352 void lgrp_plat_main_init(void); 353 354 int lgrp_plat_max_lgrps(void); 355 356 pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, 357 lgrp_mem_query_t query); 358 359 lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn); 360 361 void lgrp_plat_probe(void); 362 363 lgrp_handle_t lgrp_plat_root_hand(void); 364 365 366 /* 367 * Forward declarations of local routines 368 */ 369 static int is_opteron(void); 370 371 static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node); 372 373 static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, 374 uint32_t domain); 375 376 static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 377 
lgrp_plat_latency_stats_t *lat_stats, 378 lgrp_plat_probe_stats_t *probe_stats); 379 380 static int lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 381 lgrp_plat_latency_stats_t *lat_stats); 382 383 static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); 384 385 static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, 386 uint32_t domain); 387 388 static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 389 node_phys_addr_map_t *node_memory, uintptr_t start, uintptr_t end, 390 uint32_t domain); 391 392 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 393 lgrp_plat_probe_mem_config_t *probe_mem_config, 394 lgrp_plat_latency_stats_t *lat_stats, 395 lgrp_plat_probe_stats_t *probe_stats); 396 397 static int lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 398 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats); 399 400 static int lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt, 401 node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, 402 node_phys_addr_map_t *node_memory); 403 404 static int lgrp_plat_srat_domains(struct srat *tp); 405 406 static void lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 407 lgrp_plat_latency_stats_t *lat_stats); 408 409 static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 410 node_phys_addr_map_t *node_memory); 411 412 static hrtime_t opt_probe_vendor(int dest_node, int nreads); 413 414 415 /* 416 * PLATFORM INTERFACE ROUTINES 417 */ 418 419 /* 420 * Configure memory nodes for machines with more than one node (ie NUMA) 421 */ 422 void 423 plat_build_mem_nodes(struct memlist *list) 424 { 425 pfn_t cur_start; /* start addr of subrange */ 426 pfn_t cur_end; /* end addr of subrange */ 427 pfn_t start; /* start addr of whole range */ 428 pfn_t end; /* end addr of whole range */ 429 430 /* 431 * Boot install lists are arranged <addr, len>, ... 432 */ 433 while (list) { 434 int node; 435 436 start = list->address >> PAGESHIFT; 437 end = (list->address + list->size - 1) >> PAGESHIFT; 438 439 if (start > physmax) { 440 list = list->next; 441 continue; 442 } 443 if (end > physmax) 444 end = physmax; 445 446 /* 447 * When there is only one memnode, just add memory to memnode 448 */ 449 if (max_mem_nodes == 1) { 450 mem_node_add_slice(start, end); 451 list = list->next; 452 continue; 453 } 454 455 /* 456 * mem_node_add_slice() expects to get a memory range that 457 * is within one memnode, so need to split any memory range 458 * that spans multiple memnodes into subranges that are each 459 * contained within one memnode when feeding them to 460 * mem_node_add_slice() 461 */ 462 cur_start = start; 463 do { 464 node = plat_pfn_to_mem_node(cur_start); 465 466 /* 467 * Panic if DRAM address map registers or SRAT say 468 * memory in node doesn't exist or address from 469 * boot installed memory list entry isn't in this node. 470 * This shouldn't happen and rest of code can't deal 471 * with this if it does. 
			 */
			if (node < 0 || node >= lgrp_plat_node_cnt ||
			    !lgrp_plat_node_memory[node].exists ||
			    cur_start < lgrp_plat_node_memory[node].start ||
			    cur_start > lgrp_plat_node_memory[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			if (lgrp_plat_node_memory[node].exists &&
			    cur_end > lgrp_plat_node_memory[node].end)
				cur_end = lgrp_plat_node_memory[node].end;

			mem_node_add_slice(cur_start, cur_end);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}


int
plat_lgrphand_to_mem_node(lgrp_handle_t hand)
{
	if (max_mem_nodes == 1)
		return (0);

	return ((int)hand);
}


/*
 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
 * range of pages aligned on pfncnt that crosses a node boundary.  Returns 1 if
 * a crossing is found and returns 0 otherwise.
 */
int
plat_mnode_xcheck(pfn_t pfncnt)
{
	int	node, prevnode = -1, basenode;
	pfn_t	ea, sa;

	for (node = 0; node < lgrp_plat_node_cnt; node++) {

		if (lgrp_plat_node_memory[node].exists == 0)
			continue;

		if (prevnode == -1) {
			prevnode = node;
			basenode = node;
			continue;
		}

		/* assume x86 node pfn ranges are in increasing order */
		ASSERT(lgrp_plat_node_memory[node].start >
		    lgrp_plat_node_memory[prevnode].end);

		/*
		 * continue if the starting address of node is not contiguous
		 * with the previous node.
		 */

		if (lgrp_plat_node_memory[node].start !=
		    (lgrp_plat_node_memory[prevnode].end + 1)) {
			basenode = node;
			prevnode = node;
			continue;
		}

		/* check if the starting address of node is pfncnt aligned */
		if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) {

			/*
			 * at this point, node starts at an unaligned boundary
			 * and is contiguous with the previous node(s) to
			 * basenode.  Check if there is an aligned contiguous
			 * range of length pfncnt that crosses this boundary.
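			 *
			 * For instance (made-up page numbers, purely to
			 * illustrate the check below): with pfncnt == 0x200,
			 * if prevnode ends at pfn 0x12345 and node starts at
			 * pfn 0x12346, then sa == 0x12200 and ea == 0x12400,
			 * an aligned range of 0x200 pages that straddles the
			 * node boundary.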
561 */ 562 563 sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, 564 pfncnt); 565 ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), 566 pfncnt); 567 568 ASSERT((ea - sa) == pfncnt); 569 if (sa >= lgrp_plat_node_memory[basenode].start && 570 ea <= (lgrp_plat_node_memory[node].end + 1)) 571 return (1); 572 } 573 prevnode = node; 574 } 575 return (0); 576 } 577 578 579 lgrp_handle_t 580 plat_mem_node_to_lgrphand(int mnode) 581 { 582 if (max_mem_nodes == 1) 583 return (LGRP_DEFAULT_HANDLE); 584 585 return ((lgrp_handle_t)mnode); 586 } 587 588 589 int 590 plat_pfn_to_mem_node(pfn_t pfn) 591 { 592 int node; 593 594 if (max_mem_nodes == 1) 595 return (0); 596 597 for (node = 0; node < lgrp_plat_node_cnt; node++) { 598 /* 599 * Skip nodes with no memory 600 */ 601 if (!lgrp_plat_node_memory[node].exists) 602 continue; 603 604 if (pfn >= lgrp_plat_node_memory[node].start && 605 pfn <= lgrp_plat_node_memory[node].end) 606 return (node); 607 } 608 609 /* 610 * Didn't find memnode where this PFN lives which should never happen 611 */ 612 ASSERT(node < lgrp_plat_node_cnt); 613 return (-1); 614 } 615 616 617 /* 618 * LGROUP PLATFORM INTERFACE ROUTINES 619 */ 620 621 /* 622 * Allocate additional space for an lgroup. 623 */ 624 /* ARGSUSED */ 625 lgrp_t * 626 lgrp_plat_alloc(lgrp_id_t lgrpid) 627 { 628 lgrp_t *lgrp; 629 630 lgrp = &lgrp_space[nlgrps_alloc++]; 631 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 632 return (NULL); 633 return (lgrp); 634 } 635 636 637 /* 638 * Platform handling for (re)configuration changes 639 */ 640 /* ARGSUSED */ 641 void 642 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 643 { 644 } 645 646 647 /* 648 * Return the platform handle for the lgroup containing the given CPU 649 */ 650 /* ARGSUSED */ 651 lgrp_handle_t 652 lgrp_plat_cpu_to_hand(processorid_t id) 653 { 654 lgrp_handle_t hand; 655 656 if (lgrp_plat_node_cnt == 1) 657 return (LGRP_DEFAULT_HANDLE); 658 659 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 660 lgrp_plat_cpu_node); 661 662 ASSERT(hand != (lgrp_handle_t)-1); 663 if (hand == (lgrp_handle_t)-1) 664 return (LGRP_NULL_HANDLE); 665 666 return (hand); 667 } 668 669 670 /* 671 * Platform-specific initialization of lgroups 672 */ 673 void 674 lgrp_plat_init(void) 675 { 676 #if defined(__xpv) 677 /* 678 * XXPV For now, the hypervisor treats all memory equally. 
	 */
	lgrp_plat_node_cnt = max_mem_nodes = 1;
#else	/* __xpv */
	uint_t	probe_op;

	/*
	 * Initialize as a UMA machine
	 */
	if (lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Determine which CPUs and memory are local to each other and number
	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
	 */
	lgrp_plat_srat_error = lgrp_plat_process_srat(srat_ptr,
	    &lgrp_plat_node_cnt, lgrp_plat_node_domain, lgrp_plat_cpu_node,
	    lgrp_plat_node_memory);

	/*
	 * Try to use PCI config space registers on Opteron if SRAT doesn't
	 * exist or there is some error processing the SRAT
	 */
	if (lgrp_plat_srat_error != 0 && is_opteron())
		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
		    lgrp_plat_node_memory);

	/*
	 * Don't bother to setup system for multiple lgroups and only use one
	 * memory node when memory is interleaved between any nodes or there is
	 * only one NUMA node
	 *
	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
	 *	 when and if it happens for x86/x64
	 */
	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
		return;
	}

	/*
	 * Leaf lgroups on x86/x64 architectures contain one physical
	 * processor chip. Tune lgrp_expand_proc_thresh and
	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
	 * things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;

	/*
	 * There should be one memnode (physical page free list(s)) for
	 * each node
	 */
	max_mem_nodes = lgrp_plat_node_cnt;

	/*
	 * Determine how far each NUMA node is from each other by
	 * reading ACPI System Locality Information Table (SLIT) if it
	 * exists
	 */
	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
	    &lgrp_plat_lat_stats);
	if (lgrp_plat_slit_error == 0)
		return;

	/*
	 * Probe to determine latency between NUMA nodes when SLIT
	 * doesn't exist or make sense
	 */
	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;

	/*
	 * Specify whether to probe using vendor ID register or page copy
	 * if hasn't been specified already or is overspecified
	 */
	probe_op = lgrp_plat_probe_flags &
	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);

	if (probe_op == 0 ||
	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
		lgrp_plat_probe_flags &=
		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
		if (is_opteron())
			lgrp_plat_probe_flags |=
			    LGRP_PLAT_PROBE_VENDOR;
		else
			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
	}

	/*
	 * Probing errors can mess up the lgroup topology and
	 * force us to fall back to a 2 level lgroup topology.
	 * Here we bound how tall the lgroup topology can grow
	 * in hopes of avoiding any anomalies in probing from
	 * messing up the lgroup topology by limiting the
	 * accuracy of the latency topology.
	 *
	 * Assume that nodes will at least be configured in a
	 * ring, so limit height of lgroup topology to be less
	 * than number of nodes on a system with 4 or more
	 * nodes
	 */
	if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() ==
	    lgrp_topo_ht_limit_default())
		(void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1);
#endif	/* __xpv */
}


/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 */
/* ARGSUSED */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;
	int		node;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_lat_stats.latency_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node
	 */
	node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
	ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
	if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src)
		lgrp_plat_probe();

	return (lgrp_plat_lat_stats.latencies[src][dest]);
}


/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
	 */
	if (lgrp_plat_mem_intrlv)
		cmn_err(CE_NOTE,
		    "MPO disabled because memory is interleaved\n");

	/*
	 * Don't bother to do any probing if it is disabled, there is only one
	 * node, or the height of the lgroup topology is less than or equal
	 * to 2
	 */
	ht_limit = lgrp_topo_ht_limit();
	if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) ||
	    max_mem_nodes == 1 || ht_limit <= 2) {
		/*
		 * Setup lgroup latencies for 2 level lgroup topology
		 * (ie. local and remote only) if they haven't been set yet
		 */
		if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 &&
		    lgrp_plat_lat_stats.latency_max == 0)
			lgrp_plat_2level_setup(lgrp_plat_node_memory,
			    &lgrp_plat_lat_stats);
		return;
	}

	if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) {
		/*
		 * Should have been able to probe from CPU 0 when it was added
		 * to lgroup hierarchy, but may not have been able to then
		 * because it happens so early in boot that gethrtime() hasn't
		 * been initialized.  (:-(
		 */
		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for each sample taken when
	 * determining the lgroup topology
	 */
	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
			continue;
		}

		/*
		 * Allocate kernel virtual memory for probing this node
		 */
		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Get PFN for first page in each node
		 */
		lgrp_plat_probe_mem_config.probe_pfn[i] =
		    mem_node_config[mnode].physbase;

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
		    lgrp_plat_probe_mem_config.probe_memsize,
		    lgrp_plat_probe_mem_config.probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}


/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number of
 * nodes. Once topology is known it returns the actual maximum number of lgrps
 * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and
 * dynamic addition of new nodes, this number may not grow during system
 * lifetime (yet).
 */
int
lgrp_plat_max_lgrps(void)
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}


/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
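 *
 * For example (illustrative only; "hand" is just a placeholder for a valid
 * lgroup platform handle), a caller wanting the number of base pagesize
 * pages currently free in an lgroup would use:
 *
 *	npgs = lgrp_plat_mem_size(hand, LGRP_MEM_SIZE_FREE);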
976 */ 977 pgcnt_t 978 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 979 { 980 int mnode; 981 pgcnt_t npgs = (pgcnt_t)0; 982 extern struct memlist *phys_avail; 983 extern struct memlist *phys_install; 984 985 986 if (plathand == LGRP_DEFAULT_HANDLE) 987 return (lgrp_plat_mem_size_default(plathand, query)); 988 989 if (plathand != LGRP_NULL_HANDLE) { 990 mnode = plat_lgrphand_to_mem_node(plathand); 991 if (mnode >= 0 && mem_node_config[mnode].exists) { 992 switch (query) { 993 case LGRP_MEM_SIZE_FREE: 994 npgs = MNODE_PGCNT(mnode); 995 break; 996 case LGRP_MEM_SIZE_AVAIL: 997 npgs = mem_node_memlist_pages(mnode, 998 phys_avail); 999 break; 1000 case LGRP_MEM_SIZE_INSTALL: 1001 npgs = mem_node_memlist_pages(mnode, 1002 phys_install); 1003 break; 1004 default: 1005 break; 1006 } 1007 } 1008 } 1009 return (npgs); 1010 } 1011 1012 1013 /* 1014 * Return the platform handle of the lgroup that contains the physical memory 1015 * corresponding to the given page frame number 1016 */ 1017 /* ARGSUSED */ 1018 lgrp_handle_t 1019 lgrp_plat_pfn_to_hand(pfn_t pfn) 1020 { 1021 int mnode; 1022 1023 if (max_mem_nodes == 1) 1024 return (LGRP_DEFAULT_HANDLE); 1025 1026 if (pfn > physmax) 1027 return (LGRP_NULL_HANDLE); 1028 1029 mnode = plat_pfn_to_mem_node(pfn); 1030 if (mnode < 0) 1031 return (LGRP_NULL_HANDLE); 1032 1033 return (MEM_NODE_2_LGRPHAND(mnode)); 1034 } 1035 1036 1037 /* 1038 * Probe memory in each node from current CPU to determine latency topology 1039 * 1040 * The probing code will probe the vendor ID register on the Northbridge of 1041 * Opteron processors and probe memory for other processors by default. 1042 * 1043 * Since probing is inherently error prone, the code takes laps across all the 1044 * nodes probing from each node to each of the other nodes some number of 1045 * times. Furthermore, each node is probed some number of times before moving 1046 * onto the next one during each lap. The minimum latency gotten between nodes 1047 * is kept as the latency between the nodes. 1048 * 1049 * After all that, the probe times are adjusted by normalizing values that are 1050 * close to each other and local latencies are made the same. Lastly, the 1051 * latencies are verified to make sure that certain conditions are met (eg. 1052 * local < remote, latency(a, b) == latency(b, a), etc.). 1053 * 1054 * If any of the conditions aren't met, the code will export a NUMA 1055 * configuration with the local CPUs and memory given by the SRAT or PCI config 1056 * space registers and one remote memory latency since it can't tell exactly 1057 * how far each node is from each other. 
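 *
 * As a rough sketch of the loop below (the names are the ones used in this
 * file; this is a simplification, not the exact code), each call probes from
 * the current node to every node and keeps the minimum time observed:
 *
 *	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
 *		for (to = 0; to < lgrp_plat_node_cnt; to++) {
 *			probe_time = lgrp_plat_probe_time(to, ...);
 *			if (latencies[from][to] == 0 ||
 *			    probe_time < latencies[from][to])
 *				latencies[from][to] = probe_time;
 *		}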
1058 */ 1059 void 1060 lgrp_plat_probe(void) 1061 { 1062 int from; 1063 int i; 1064 lgrp_plat_latency_stats_t *lat_stats; 1065 hrtime_t probe_time; 1066 int to; 1067 1068 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 1069 max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 1070 return; 1071 1072 /* 1073 * Determine ID of node containing current CPU 1074 */ 1075 from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 1076 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1077 if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 1078 ASSERT(lgrp_plat_node_domain[from].exists); 1079 1080 /* 1081 * Don't need to probe if got times already 1082 */ 1083 lat_stats = &lgrp_plat_lat_stats; 1084 if (lat_stats->latencies[from][from] != 0) 1085 return; 1086 1087 /* 1088 * Read vendor ID in Northbridge or read and write page(s) 1089 * in each node from current CPU and remember how long it takes, 1090 * so we can build latency topology of machine later. 1091 * This should approximate the memory latency between each node. 1092 */ 1093 for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 1094 for (to = 0; to < lgrp_plat_node_cnt; to++) { 1095 /* 1096 * Get probe time and bail out if can't get it yet 1097 */ 1098 probe_time = lgrp_plat_probe_time(to, 1099 lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config, 1100 &lgrp_plat_lat_stats, &lgrp_plat_probe_stats); 1101 if (probe_time == 0) 1102 return; 1103 1104 /* 1105 * Keep lowest probe time as latency between nodes 1106 */ 1107 if (lat_stats->latencies[from][to] == 0 || 1108 probe_time < lat_stats->latencies[from][to]) 1109 lat_stats->latencies[from][to] = probe_time; 1110 1111 /* 1112 * Update overall minimum and maximum probe times 1113 * across all nodes 1114 */ 1115 if (probe_time < lat_stats->latency_min || 1116 lat_stats->latency_min == -1) 1117 lat_stats->latency_min = probe_time; 1118 if (probe_time > lat_stats->latency_max) 1119 lat_stats->latency_max = probe_time; 1120 } 1121 } 1122 1123 /* 1124 * - Fix up latencies such that local latencies are same, 1125 * latency(i, j) == latency(j, i), etc. 
(if possible) 1126 * 1127 * - Verify that latencies look ok 1128 * 1129 * - Fallback to just optimizing for local and remote if 1130 * latencies didn't look right 1131 */ 1132 lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats, 1133 &lgrp_plat_probe_stats); 1134 lgrp_plat_probe_stats.probe_error_code = 1135 lgrp_plat_latency_verify(lgrp_plat_node_memory, 1136 &lgrp_plat_lat_stats); 1137 if (lgrp_plat_probe_stats.probe_error_code) 1138 lgrp_plat_2level_setup(lgrp_plat_node_memory, 1139 &lgrp_plat_lat_stats); 1140 } 1141 1142 1143 /* 1144 * Return platform handle for root lgroup 1145 */ 1146 lgrp_handle_t 1147 lgrp_plat_root_hand(void) 1148 { 1149 return (LGRP_DEFAULT_HANDLE); 1150 } 1151 1152 1153 /* 1154 * INTERNAL ROUTINES 1155 */ 1156 1157 1158 /* 1159 * Update CPU to node mapping for given CPU and proximity domain (and returns 1160 * negative numbers for errors and positive ones for success) 1161 */ 1162 static int 1163 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, 1164 cpu_node_map_t *cpu_node, uint32_t apicid, uint32_t domain) 1165 { 1166 uint_t i; 1167 uint_t start; 1168 int node; 1169 1170 /* 1171 * Get node number for proximity domain 1172 */ 1173 node = lgrp_plat_domain_to_node(node_domain, domain); 1174 if (node == -1) { 1175 node = lgrp_plat_node_domain_update(node_domain, domain); 1176 if (node == -1) 1177 return (-1); 1178 } 1179 1180 /* 1181 * Hash given CPU APIC ID into CPU to node mapping table/array and 1182 * enter it and its corresponding node and proximity domain IDs into 1183 * first non-existent or matching entry 1184 */ 1185 i = start = CPU_NODE_HASH(apicid); 1186 do { 1187 if (cpu_node[i].exists) { 1188 /* 1189 * Update already existing entry for CPU 1190 */ 1191 if (cpu_node[i].apicid == apicid) { 1192 /* 1193 * Just return when everything same 1194 */ 1195 if (cpu_node[i].prox_domain == domain && 1196 cpu_node[i].node == node) 1197 return (1); 1198 1199 /* 1200 * Assert that proximity domain and node IDs 1201 * should be same and return error on non-debug 1202 * kernel 1203 */ 1204 ASSERT(cpu_node[i].prox_domain == domain && 1205 cpu_node[i].node == node); 1206 return (-1); 1207 } 1208 } else { 1209 /* 1210 * Create new entry for CPU 1211 */ 1212 cpu_node[i].exists = 1; 1213 cpu_node[i].apicid = apicid; 1214 cpu_node[i].prox_domain = domain; 1215 cpu_node[i].node = node; 1216 return (0); 1217 } 1218 i = CPU_NODE_HASH(i + 1); 1219 } while (i != start); 1220 1221 /* 1222 * Ran out of supported number of entries which shouldn't happen.... 1223 */ 1224 ASSERT(i != start); 1225 return (-1); 1226 } 1227 1228 1229 /* 1230 * Get node ID for given CPU ID 1231 */ 1232 static int 1233 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node) 1234 { 1235 uint32_t apicid; 1236 uint_t i; 1237 uint_t start; 1238 1239 if (cp == NULL) 1240 return (-1); 1241 1242 /* 1243 * SRAT doesn't exist, isn't enabled, or there was an error processing 1244 * it, so return chip ID for Opteron and -1 otherwise. 
1245 */ 1246 if (srat_ptr == NULL || !lgrp_plat_srat_enable || 1247 lgrp_plat_srat_error) { 1248 if (is_opteron()) 1249 return (pg_plat_hw_instance_id(cp, PGHW_CHIP)); 1250 return (-1); 1251 } 1252 1253 /* 1254 * SRAT does exist, so get APIC ID for given CPU and map that to its 1255 * node ID 1256 */ 1257 apicid = cpuid_get_apicid(cp); 1258 i = start = CPU_NODE_HASH(apicid); 1259 do { 1260 if (cpu_node[i].apicid == apicid && cpu_node[i].exists) 1261 return (cpu_node[i].node); 1262 i = CPU_NODE_HASH(i + 1); 1263 } while (i != start); 1264 return (-1); 1265 } 1266 1267 1268 /* 1269 * Return node number for given proximity domain/system locality 1270 */ 1271 static int 1272 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, uint32_t domain) 1273 { 1274 uint_t node; 1275 uint_t start; 1276 1277 /* 1278 * Hash proximity domain ID into node to domain mapping table (array), 1279 * search for entry with matching proximity domain ID, and return index 1280 * of matching entry as node ID. 1281 */ 1282 node = start = NODE_DOMAIN_HASH(domain); 1283 do { 1284 if (node_domain[node].prox_domain == domain && 1285 node_domain[node].exists) 1286 return (node); 1287 node = NODE_DOMAIN_HASH(node + 1); 1288 } while (node != start); 1289 return (-1); 1290 } 1291 1292 1293 /* 1294 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 1295 * be considered same 1296 */ 1297 #define LGRP_LAT_TOLERANCE_SHIFT 4 1298 1299 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 1300 1301 1302 /* 1303 * Adjust latencies between nodes to be symmetric, normalize latencies between 1304 * any nodes that are within some tolerance to be same, and make local 1305 * latencies be same 1306 */ 1307 static void 1308 lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 1309 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1310 { 1311 int i; 1312 int j; 1313 int k; 1314 int l; 1315 u_longlong_t max; 1316 u_longlong_t min; 1317 u_longlong_t t; 1318 u_longlong_t t1; 1319 u_longlong_t t2; 1320 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 1321 int lat_corrected[MAX_NODES][MAX_NODES]; 1322 1323 /* 1324 * Nothing to do when this is an UMA machine or don't have args needed 1325 */ 1326 if (max_mem_nodes == 1) 1327 return; 1328 1329 ASSERT(node_memory != NULL && lat_stats != NULL && 1330 probe_stats != NULL); 1331 1332 /* 1333 * Make sure that latencies are symmetric between any two nodes 1334 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1335 */ 1336 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1337 if (!node_memory[i].exists) 1338 continue; 1339 1340 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1341 if (!node_memory[j].exists) 1342 continue; 1343 1344 t1 = lat_stats->latencies[i][j]; 1345 t2 = lat_stats->latencies[j][i]; 1346 1347 if (t1 == 0 || t2 == 0 || t1 == t2) 1348 continue; 1349 1350 /* 1351 * Latencies should be same 1352 * - Use minimum of two latencies which should be same 1353 * - Track suspect probe times not within tolerance of 1354 * min value 1355 * - Remember how much values are corrected by 1356 */ 1357 if (t1 > t2) { 1358 t = t2; 1359 probe_stats->probe_errors[i][j] += t1 - t2; 1360 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 1361 probe_stats->probe_suspect[i][j]++; 1362 probe_stats->probe_suspect[j][i]++; 1363 } 1364 } else if (t2 > t1) { 1365 t = t1; 1366 probe_stats->probe_errors[j][i] += t2 - t1; 1367 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 1368 probe_stats->probe_suspect[i][j]++; 1369 probe_stats->probe_suspect[j][i]++; 1370 } 1371 } 1372 1373 lat_stats->latencies[i][j] = 1374 lat_stats->latencies[j][i] = t; 1375 lgrp_config(cflag, t1, t); 1376 lgrp_config(cflag, t2, t); 1377 } 1378 } 1379 1380 /* 1381 * Keep track of which latencies get corrected 1382 */ 1383 for (i = 0; i < MAX_NODES; i++) 1384 for (j = 0; j < MAX_NODES; j++) 1385 lat_corrected[i][j] = 0; 1386 1387 /* 1388 * For every two nodes, see whether there is another pair of nodes which 1389 * are about the same distance apart and make the latencies be the same 1390 * if they are close enough together 1391 */ 1392 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1393 if (!node_memory[i].exists) 1394 continue; 1395 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1396 if (!node_memory[j].exists) 1397 continue; 1398 /* 1399 * Pick one pair of nodes (i, j) 1400 * and get latency between them 1401 */ 1402 t1 = lat_stats->latencies[i][j]; 1403 1404 /* 1405 * Skip this pair of nodes if there isn't a latency 1406 * for it yet 1407 */ 1408 if (t1 == 0) 1409 continue; 1410 1411 for (k = 0; k < lgrp_plat_node_cnt; k++) { 1412 if (!node_memory[k].exists) 1413 continue; 1414 for (l = 0; l < lgrp_plat_node_cnt; l++) { 1415 if (!node_memory[l].exists) 1416 continue; 1417 /* 1418 * Pick another pair of nodes (k, l) 1419 * not same as (i, j) and get latency 1420 * between them 1421 */ 1422 if (k == i && l == j) 1423 continue; 1424 1425 t2 = lat_stats->latencies[k][l]; 1426 1427 /* 1428 * Skip this pair of nodes if there 1429 * isn't a latency for it yet 1430 */ 1431 1432 if (t2 == 0) 1433 continue; 1434 1435 /* 1436 * Skip nodes (k, l) if they already 1437 * have same latency as (i, j) or 1438 * their latency isn't close enough to 1439 * be considered/made the same 1440 */ 1441 if (t1 == t2 || (t1 > t2 && t1 - t2 > 1442 t1 >> lgrp_plat_probe_lt_shift) || 1443 (t2 > t1 && t2 - t1 > 1444 t2 >> lgrp_plat_probe_lt_shift)) 1445 continue; 1446 1447 /* 1448 * Make latency(i, j) same as 1449 * latency(k, l), try to use latency 1450 * that has been adjusted already to get 1451 * more consistency (if possible), and 1452 * remember which latencies were 1453 * adjusted for next time 1454 */ 1455 if (lat_corrected[i][j]) { 1456 t = t1; 1457 lgrp_config(cflag, t2, t); 1458 t2 = t; 1459 } else if (lat_corrected[k][l]) { 1460 t = t2; 1461 lgrp_config(cflag, t1, t); 1462 t1 = t; 1463 } else { 1464 if (t1 > t2) 1465 t = t2; 1466 else 1467 t = t1; 1468 lgrp_config(cflag, t1, t); 1469 lgrp_config(cflag, t2, t); 1470 t1 = t2 = t; 
1471 } 1472 1473 lat_stats->latencies[i][j] = 1474 lat_stats->latencies[k][l] = t; 1475 1476 lat_corrected[i][j] = 1477 lat_corrected[k][l] = 1; 1478 } 1479 } 1480 } 1481 } 1482 1483 /* 1484 * Local latencies should be same 1485 * - Find min and max local latencies 1486 * - Make all local latencies be minimum 1487 */ 1488 min = -1; 1489 max = 0; 1490 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1491 if (!node_memory[i].exists) 1492 continue; 1493 t = lat_stats->latencies[i][i]; 1494 if (t == 0) 1495 continue; 1496 if (min == -1 || t < min) 1497 min = t; 1498 if (t > max) 1499 max = t; 1500 } 1501 if (min != max) { 1502 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1503 int local; 1504 1505 if (!node_memory[i].exists) 1506 continue; 1507 1508 local = lat_stats->latencies[i][i]; 1509 if (local == 0) 1510 continue; 1511 1512 /* 1513 * Track suspect probe times that aren't within 1514 * tolerance of minimum local latency and how much 1515 * probe times are corrected by 1516 */ 1517 if (local - min > min >> lgrp_plat_probe_lt_shift) 1518 probe_stats->probe_suspect[i][i]++; 1519 1520 probe_stats->probe_errors[i][i] += local - min; 1521 1522 /* 1523 * Make local latencies be minimum 1524 */ 1525 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 1526 lat_stats->latencies[i][i] = min; 1527 } 1528 } 1529 1530 /* 1531 * Determine max probe time again since just adjusted latencies 1532 */ 1533 lat_stats->latency_max = 0; 1534 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1535 if (!node_memory[i].exists) 1536 continue; 1537 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1538 if (!node_memory[j].exists) 1539 continue; 1540 t = lat_stats->latencies[i][j]; 1541 if (t > lat_stats->latency_max) 1542 lat_stats->latency_max = t; 1543 } 1544 } 1545 } 1546 1547 1548 /* 1549 * Verify following about latencies between nodes: 1550 * 1551 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 1552 * - Local latencies same 1553 * - Local < remote 1554 * - Number of latencies seen is reasonable 1555 * - Number of occurrences of a given latency should be more than 1 1556 * 1557 * Returns: 1558 * 0 Success 1559 * -1 Not symmetric 1560 * -2 Local latencies not same 1561 * -3 Local >= remote 1562 */ 1563 static int 1564 lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 1565 lgrp_plat_latency_stats_t *lat_stats) 1566 { 1567 int i; 1568 int j; 1569 u_longlong_t t1; 1570 u_longlong_t t2; 1571 1572 ASSERT(node_memory != NULL && lat_stats != NULL); 1573 1574 /* 1575 * Nothing to do when this is an UMA machine, lgroup topology is 1576 * limited to 2 levels, or there aren't any probe times yet 1577 */ 1578 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1579 lat_stats->latencies[0][0] == 0) 1580 return (0); 1581 1582 /* 1583 * Make sure that latencies are symmetric between any two nodes 1584 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1585 */ 1586 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1587 if (!node_memory[i].exists) 1588 continue; 1589 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1590 if (!node_memory[j].exists) 1591 continue; 1592 t1 = lat_stats->latencies[i][j]; 1593 t2 = lat_stats->latencies[j][i]; 1594 1595 if (t1 == 0 || t2 == 0 || t1 == t2) 1596 continue; 1597 1598 return (-1); 1599 } 1600 } 1601 1602 /* 1603 * Local latencies should be same 1604 */ 1605 t1 = lat_stats->latencies[0][0]; 1606 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1607 if (!node_memory[i].exists) 1608 continue; 1609 1610 t2 = lat_stats->latencies[i][i]; 1611 if (t2 == 0) 1612 continue; 1613 1614 if (t1 == 0) { 1615 t1 = t2; 1616 continue; 1617 } 1618 1619 if (t1 != t2) 1620 return (-2); 1621 } 1622 1623 /* 1624 * Local latencies should be less than remote 1625 */ 1626 if (t1) { 1627 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1628 if (!node_memory[i].exists) 1629 continue; 1630 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1631 if (!node_memory[j].exists) 1632 continue; 1633 t2 = lat_stats->latencies[i][j]; 1634 if (i == j || t2 == 0) 1635 continue; 1636 1637 if (t1 >= t2) 1638 return (-3); 1639 } 1640 } 1641 } 1642 1643 return (0); 1644 } 1645 1646 1647 /* 1648 * Return the number of free, allocatable, or installed 1649 * pages in an lgroup 1650 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 1651 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 1652 */ 1653 /* ARGSUSED */ 1654 static pgcnt_t 1655 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 1656 { 1657 struct memlist *mlist; 1658 pgcnt_t npgs = 0; 1659 extern struct memlist *phys_avail; 1660 extern struct memlist *phys_install; 1661 1662 switch (query) { 1663 case LGRP_MEM_SIZE_FREE: 1664 return ((pgcnt_t)freemem); 1665 case LGRP_MEM_SIZE_AVAIL: 1666 memlist_read_lock(); 1667 for (mlist = phys_avail; mlist; mlist = mlist->next) 1668 npgs += btop(mlist->size); 1669 memlist_read_unlock(); 1670 return (npgs); 1671 case LGRP_MEM_SIZE_INSTALL: 1672 memlist_read_lock(); 1673 for (mlist = phys_install; mlist; mlist = mlist->next) 1674 npgs += btop(mlist->size); 1675 memlist_read_unlock(); 1676 return (npgs); 1677 default: 1678 return ((pgcnt_t)0); 1679 } 1680 } 1681 1682 1683 /* 1684 * Update node to proximity domain mappings for given domain and return node ID 1685 */ 1686 static int 1687 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, uint32_t domain) 1688 { 1689 uint_t node; 1690 uint_t start; 1691 1692 /* 1693 * Hash proximity domain ID into node to domain mapping table (array) 1694 * and add entry for it into first non-existent or matching entry found 1695 */ 1696 node = start = NODE_DOMAIN_HASH(domain); 1697 do { 1698 /* 1699 * Entry doesn't exist yet, so create one for this proximity 1700 * domain and return node ID which is index into mapping table. 1701 */ 1702 if (!node_domain[node].exists) { 1703 node_domain[node].exists = 1; 1704 node_domain[node].prox_domain = domain; 1705 return (node); 1706 } 1707 1708 /* 1709 * Entry exists for this proximity domain already, so just 1710 * return node ID (index into table). 1711 */ 1712 if (node_domain[node].prox_domain == domain) 1713 return (node); 1714 node = NODE_DOMAIN_HASH(node + 1); 1715 } while (node != start); 1716 1717 /* 1718 * Ran out of supported number of entries which shouldn't happen.... 
1719 */ 1720 ASSERT(node != start); 1721 return (-1); 1722 } 1723 1724 1725 /* 1726 * Update node memory information for given proximity domain with specified 1727 * starting and ending physical address range (and return positive numbers for 1728 * success and negative ones for errors) 1729 */ 1730 static int 1731 lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 1732 node_phys_addr_map_t *node_memory, uintptr_t start, uintptr_t end, 1733 uint32_t domain) 1734 { 1735 int node; 1736 1737 /* 1738 * Get node number for proximity domain 1739 */ 1740 node = lgrp_plat_domain_to_node(node_domain, domain); 1741 if (node == -1) { 1742 node = lgrp_plat_node_domain_update(node_domain, domain); 1743 if (node == -1) 1744 return (-1); 1745 } 1746 1747 /* 1748 * Create entry in table for node if it doesn't exist 1749 */ 1750 if (!node_memory[node].exists) { 1751 node_memory[node].exists = 1; 1752 node_memory[node].start = btop(start); 1753 node_memory[node].end = btop(end); 1754 node_memory[node].prox_domain = domain; 1755 return (0); 1756 } 1757 1758 /* 1759 * Entry already exists for this proximity domain 1760 * 1761 * There may be more than one SRAT memory entry for a domain, so we may 1762 * need to update existing start or end address for the node. 1763 */ 1764 if (node_memory[node].prox_domain == domain) { 1765 if (btop(start) < node_memory[node].start) 1766 node_memory[node].start = btop(start); 1767 if (btop(end) > node_memory[node].end) 1768 node_memory[node].end = btop(end); 1769 return (1); 1770 } 1771 return (-2); 1772 } 1773 1774 1775 /* 1776 * Return time needed to probe from current CPU to memory in given node 1777 */ 1778 static hrtime_t 1779 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 1780 lgrp_plat_probe_mem_config_t *probe_mem_config, 1781 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1782 { 1783 caddr_t buf; 1784 hrtime_t elapsed; 1785 hrtime_t end; 1786 int from; 1787 int i; 1788 int ipl; 1789 hrtime_t max; 1790 hrtime_t min; 1791 hrtime_t start; 1792 extern int use_sse_pagecopy; 1793 1794 /* 1795 * Determine ID of node containing current CPU 1796 */ 1797 from = lgrp_plat_cpu_to_node(CPU, cpu_node); 1798 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1799 1800 /* 1801 * Do common work for probing main memory 1802 */ 1803 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 1804 /* 1805 * Skip probing any nodes without memory and 1806 * set probe time to 0 1807 */ 1808 if (probe_mem_config->probe_va[to] == NULL) { 1809 lat_stats->latencies[from][to] = 0; 1810 return (0); 1811 } 1812 1813 /* 1814 * Invalidate caches once instead of once every sample 1815 * which should cut cost of probing by a lot 1816 */ 1817 probe_stats->flush_cost = gethrtime(); 1818 invalidate_cache(); 1819 probe_stats->flush_cost = gethrtime() - 1820 probe_stats->flush_cost; 1821 probe_stats->probe_cost_total += probe_stats->flush_cost; 1822 } 1823 1824 /* 1825 * Probe from current CPU to given memory using specified operation 1826 * and take specified number of samples 1827 */ 1828 max = 0; 1829 min = -1; 1830 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 1831 probe_stats->probe_cost = gethrtime(); 1832 1833 /* 1834 * Can't measure probe time if gethrtime() isn't working yet 1835 */ 1836 if (probe_stats->probe_cost == 0 && gethrtime() == 0) 1837 return (0); 1838 1839 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 1840 /* 1841 * Measure how long it takes to read vendor ID from 1842 * Northbridge 1843 */ 1844 elapsed = opt_probe_vendor(to, 
lgrp_plat_probe_nreads); 1845 } else { 1846 /* 1847 * Measure how long it takes to copy page 1848 * on top of itself 1849 */ 1850 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 1851 1852 kpreempt_disable(); 1853 ipl = splhigh(); 1854 start = gethrtime(); 1855 if (use_sse_pagecopy) 1856 hwblkpagecopy(buf, buf); 1857 else 1858 bcopy(buf, buf, PAGESIZE); 1859 end = gethrtime(); 1860 elapsed = end - start; 1861 splx(ipl); 1862 kpreempt_enable(); 1863 } 1864 1865 probe_stats->probe_cost = gethrtime() - 1866 probe_stats->probe_cost; 1867 probe_stats->probe_cost_total += probe_stats->probe_cost; 1868 1869 if (min == -1 || elapsed < min) 1870 min = elapsed; 1871 if (elapsed > max) 1872 max = elapsed; 1873 } 1874 1875 /* 1876 * Update minimum and maximum probe times between 1877 * these two nodes 1878 */ 1879 if (min < probe_stats->probe_min[from][to] || 1880 probe_stats->probe_min[from][to] == 0) 1881 probe_stats->probe_min[from][to] = min; 1882 1883 if (max > probe_stats->probe_max[from][to]) 1884 probe_stats->probe_max[from][to] = max; 1885 1886 return (min); 1887 } 1888 1889 1890 /* 1891 * Read ACPI System Locality Information Table (SLIT) to determine how far each 1892 * NUMA node is from each other 1893 */ 1894 static int 1895 lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 1896 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats) 1897 { 1898 int i; 1899 int j; 1900 int localities; 1901 hrtime_t max; 1902 hrtime_t min; 1903 int retval; 1904 uint8_t *slit_entries; 1905 1906 if (tp == NULL || !lgrp_plat_slit_enable) 1907 return (1); 1908 1909 if (lat_stats == NULL) 1910 return (2); 1911 1912 localities = tp->number; 1913 if (localities != node_cnt) 1914 return (3); 1915 1916 min = lat_stats->latency_min; 1917 max = lat_stats->latency_max; 1918 1919 /* 1920 * Fill in latency matrix based on SLIT entries 1921 */ 1922 slit_entries = tp->entry; 1923 for (i = 0; i < localities; i++) { 1924 for (j = 0; j < localities; j++) { 1925 uint8_t latency; 1926 1927 latency = slit_entries[(i * localities) + j]; 1928 lat_stats->latencies[i][j] = latency; 1929 if (latency < min) 1930 min = latency; 1931 if (latency > max) 1932 max = latency; 1933 } 1934 } 1935 1936 /* 1937 * Verify that latencies/distances given in SLIT look reasonable 1938 */ 1939 retval = lgrp_plat_latency_verify(node_memory, lat_stats); 1940 1941 if (retval) { 1942 /* 1943 * Reinitialize (zero) latency table since SLIT doesn't look 1944 * right 1945 */ 1946 for (i = 0; i < localities; i++) { 1947 for (j = 0; j < localities; j++) 1948 lat_stats->latencies[i][j] = 0; 1949 } 1950 } else { 1951 /* 1952 * Update min and max latencies seen since SLIT looks valid 1953 */ 1954 lat_stats->latency_min = min; 1955 lat_stats->latency_max = max; 1956 } 1957 1958 return (retval); 1959 } 1960 1961 1962 /* 1963 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 1964 * and memory are local to each other in the same NUMA node 1965 */ 1966 static int 1967 lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt, 1968 node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, 1969 node_phys_addr_map_t *node_memory) 1970 { 1971 struct srat_item *end; 1972 int i; 1973 struct srat_item *item; 1974 1975 if (tp == NULL || !lgrp_plat_srat_enable) 1976 return (1); 1977 1978 /* 1979 * Determine number of nodes by counting number of proximity domains in 1980 * SRAT 1981 */ 1982 if (node_cnt) { 1983 int nodes; 1984 1985 nodes = lgrp_plat_srat_domains(tp); 1986 if (nodes < 0) { 1987 *node_cnt = 1; 1988 return (2); 
1989 } 1990 *node_cnt = nodes; 1991 } 1992 1993 /* 1994 * Walk through SRAT, examining each CPU and memory entry to determine 1995 * which CPUs and memory belong to which node. 1996 */ 1997 item = tp->list; 1998 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 1999 while (item < end) { 2000 uint32_t apic_id; 2001 uint32_t domain; 2002 uint64_t end; 2003 uint64_t length; 2004 uint64_t start; 2005 2006 switch (item->type) { 2007 case SRAT_PROCESSOR: /* CPU entry */ 2008 if (!(item->i.p.flags & SRAT_ENABLED) || 2009 cpu_node == NULL) 2010 break; 2011 2012 /* 2013 * Calculate domain (node) ID and fill in APIC ID to 2014 * domain/node mapping table 2015 */ 2016 domain = item->i.p.domain1; 2017 for (i = 0; i < 3; i++) { 2018 domain += item->i.p.domain2[i] << 2019 ((i + 1) * 8); 2020 } 2021 apic_id = item->i.p.apic_id; 2022 2023 if (lgrp_plat_cpu_node_update(node_domain, cpu_node, 2024 apic_id, domain) < 0) 2025 return (3); 2026 break; 2027 2028 case SRAT_MEMORY: /* memory entry */ 2029 if (!(item->i.m.flags & SRAT_ENABLED) || 2030 node_memory == NULL) 2031 break; 2032 2033 /* 2034 * Get domain (node) ID and fill in domain/node 2035 * to memory mapping table 2036 */ 2037 domain = item->i.m.domain; 2038 start = item->i.m.base_addr; 2039 length = item->i.m.len; 2040 end = start + length - 1; 2041 2042 if (lgrp_plat_node_memory_update(node_domain, 2043 node_memory, start, end, domain) < 0) 2044 return (4); 2045 break; 2046 2047 default: 2048 break; 2049 } 2050 2051 item = (struct srat_item *)((uintptr_t)item + item->len); 2052 } 2053 return (0); 2054 } 2055 2056 2057 /* 2058 * Return number of proximity domains given in ACPI SRAT 2059 */ 2060 static int 2061 lgrp_plat_srat_domains(struct srat *tp) 2062 { 2063 int domain_cnt; 2064 struct srat_item *end; 2065 int i; 2066 struct srat_item *item; 2067 node_domain_map_t node_domain[MAX_NODES]; 2068 2069 2070 if (tp == NULL || !lgrp_plat_srat_enable) 2071 return (1); 2072 2073 /* 2074 * Walk through SRAT, examining each CPU and memory entry to determine 2075 * proximity domain ID for each. 
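 *
 * Note that the SRAT processor affinity entry keeps the low byte of its
 * 32-bit proximity domain in one field (domain1) and the upper three bytes
 * in another (domain2[]), so the two are reassembled below before the
 * domain is hashed and counted; memory affinity entries carry the whole
 * 32-bit domain in a single field.  As a purely illustrative example (the
 * values are made up), domain1 == 0x05 with domain2[] == { 0x01, 0x00, 0x00 }
 * reassembles to proximity domain 0x105.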
2076  */
2077 	domain_cnt = 0;
2078 	item = tp->list;
2079 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2080 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
2081 	while (item < end) {
2082 		uint32_t	domain;
2083 		boolean_t	overflow;
2084 		uint_t		start;
2085
2086 		switch (item->type) {
2087 		case SRAT_PROCESSOR:	/* CPU entry */
2088 			if (!(item->i.p.flags & SRAT_ENABLED))
2089 				break;
2090 			domain = item->i.p.domain1;
2091 			for (i = 0; i < 3; i++) {
2092 				domain += item->i.p.domain2[i] <<
2093 				    ((i + 1) * 8);
2094 			}
2095 			break;
2096
2097 		case SRAT_MEMORY:	/* memory entry */
2098 			if (!(item->i.m.flags & SRAT_ENABLED))
2099 				break;
2100 			domain = item->i.m.domain;
2101 			break;
2102
2103 		default:
2104 			break;
2105 		}
2106
2107 		/*
2108 		 * Count and track which proximity domain IDs have been seen
2109 		 */
2110 		start = i = domain % MAX_NODES;
2111 		overflow = B_TRUE;
2112 		do {
2113 			/*
2114 			 * Create an entry for this proximity domain and
2115 			 * increment the count when no entry exists in the
2116 			 * slot where the proximity domain hashed
2117 			 */
2118 			if (!node_domain[i].exists) {
2119 				node_domain[i].exists = 1;
2120 				node_domain[i].prox_domain = domain;
2121 				domain_cnt++;
2122 				overflow = B_FALSE;
2123 				break;
2124 			}
2125
2126 			/*
2127 			 * Nothing to do when this proximity domain has been
2128 			 * seen already and its entry exists
2129 			 */
2130 			if (node_domain[i].prox_domain == domain) {
2131 				overflow = B_FALSE;
2132 				break;
2133 			}
2134
2135 			/*
2136 			 * An entry exists where this proximity domain hashed,
2137 			 * but for a different proximity domain, so keep
2138 			 * searching for an empty slot or a matching entry,
2139 			 * whichever comes first.
2140 			 */
2141 			i = (i + 1) % MAX_NODES;
2142 		} while (i != start);
2143
2144 		/*
2145 		 * Didn't find an empty or matching entry, which means there
2146 		 * are more proximity domains than supported nodes.
2147 		 */
2148 		ASSERT(overflow != B_TRUE);
2149 		if (overflow == B_TRUE)
2150 			return (-1);
2151
2152 		item = (struct srat_item *)((uintptr_t)item + item->len);
2153 	}
2154 	return (domain_cnt);
2155 }
2156
2157
2158 /*
2159  * Set lgroup latencies for 2 level lgroup topology
2160  */
2161 static void
2162 lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
2163     lgrp_plat_latency_stats_t *lat_stats)
2164 {
2165 	int	i;
2166
2167 	ASSERT(node_memory != NULL && lat_stats != NULL);
2168
2169 	if (lgrp_plat_node_cnt >= 4)
2170 		cmn_err(CE_NOTE,
2171 		    "MPO only optimizing for local and remote\n");
2172 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2173 		int	j;
2174
2175 		if (!node_memory[i].exists)
2176 			continue;
2177 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2178 			if (!node_memory[j].exists)
2179 				continue;
2180 			if (i == j)
2181 				lat_stats->latencies[i][j] = 2;
2182 			else
2183 				lat_stats->latencies[i][j] = 3;
2184 		}
2185 	}
2186 	lat_stats->latency_min = 2;
2187 	lat_stats->latency_max = 3;
2188 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
2189 }
2190
2191
2192 /*
2193  * The following Opteron-specific constants, macros, types, and routines define
2194  * PCI configuration space registers and how to read them to determine the NUMA
2195  * configuration of *supported* Opteron processors.  They provide the same
2196  * information that may be obtained from the ACPI System Resource Affinity Table
2197  * (SRAT) if it exists on the machine of interest.
2198  *
2199  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
2200  * of interest describes all of these registers and their contents.
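 *
 * On these processors, the on-chip Northbridge for node N shows up as PCI
 * device (OPT_PCS_DEV_NODE0 + N) on bus 0, with separate functions for
 * Hypertransport, address map, DRAM, and miscellaneous configuration (see
 * the OPT_PCS_* constants below), which is why the code walks consecutive
 * device numbers to visit each node.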
The main 2201 * registers used by this code to determine the NUMA configuration of the 2202 * machine are the node ID register for the number of NUMA nodes and the DRAM 2203 * address map registers for the physical address range of each node. 2204 * 2205 * NOTE: The format and how to determine the NUMA configuration using PCI 2206 * config space registers may change or may not be supported in future 2207 * Opteron processor families. 2208 */ 2209 2210 /* 2211 * How many bits to shift Opteron DRAM Address Map base and limit registers 2212 * to get actual value 2213 */ 2214 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 2215 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 2216 2217 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 2218 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 2219 2220 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 2221 2222 /* 2223 * Macros to derive addresses from Opteron DRAM Address Map registers 2224 */ 2225 #define OPT_DRAMADDR_HI(reg) \ 2226 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 2227 OPT_DRAMADDR_HI_LSHIFT_ADDR) 2228 2229 #define OPT_DRAMADDR_LO(reg) \ 2230 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 2231 OPT_DRAMADDR_LO_LSHIFT_ADDR) 2232 2233 #define OPT_DRAMADDR(high, low) \ 2234 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 2235 2236 /* 2237 * Bit masks defining what's in Opteron DRAM Address Map base register 2238 */ 2239 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 2240 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 2241 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 2242 2243 /* 2244 * Bit masks defining what's in Opteron DRAM Address Map limit register 2245 */ 2246 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 2247 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 2248 2249 2250 /* 2251 * Opteron Node ID register in PCI configuration space contains 2252 * number of nodes in system, etc. for Opteron K8. The following 2253 * constants and macros define its contents, structure, and access. 2254 */ 2255 2256 /* 2257 * Bit masks defining what's in Opteron Node ID register 2258 */ 2259 #define OPT_NODE_MASK_ID 0x7 /* node ID */ 2260 #define OPT_NODE_MASK_CNT 0x70 /* node count */ 2261 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 2262 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 2263 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 2264 2265 /* 2266 * How many bits in Opteron Node ID register to shift right to get actual value 2267 */ 2268 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 2269 2270 /* 2271 * Macros to get values from Opteron Node ID register 2272 */ 2273 #define OPT_NODE_CNT(reg) \ 2274 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 2275 2276 /* 2277 * Macro to setup PCI Extended Configuration Space (ECS) address to give to 2278 * "in/out" instructions 2279 * 2280 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 2281 * other uses should just do MMIO to access PCI ECS. 2282 * Must enable special bit in Northbridge Configuration Register on 2283 * Greyhound for extended CF8 space access to be able to access PCI ECS 2284 * using "in/out" instructions and restore special bit after done 2285 * accessing PCI ECS. 
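 *
 * For example, reading the high DRAM base register of node 0 (bus 0,
 * device OPT_PCS_DEV_NODE0, function OPT_PCS_FUNC_ADDRMAP, register
 * OPT_PCS_OFF_DRAMBASE_HI == 0x140) uses the address
 * OPT_PCI_ECS_ADDR(0, 24, 1, 0x140) == (PCI_CONE | 0x0100c140), where
 * the extended register bits (0x1) land in bits <27:24> and the low
 * register offset (0x40) stays in its usual CF8 position.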
2286 */ 2287 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 2288 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 2289 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 2290 ((((reg) >> 8) & 0xf) << 24)) 2291 2292 /* 2293 * PCI configuration space registers accessed by specifying 2294 * a bus, device, function, and offset. The following constants 2295 * define the values needed to access Opteron K8 configuration 2296 * info to determine its node topology 2297 */ 2298 2299 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 2300 2301 /* 2302 * Opteron PCI configuration space register function values 2303 */ 2304 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 2305 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 2306 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 2307 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 2308 2309 /* 2310 * PCI Configuration Space register offsets 2311 */ 2312 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 2313 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 2314 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 2315 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 2316 2317 /* 2318 * Opteron PCI Configuration Space device IDs for nodes 2319 */ 2320 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 2321 2322 2323 /* 2324 * Opteron DRAM address map gives base and limit for physical memory in a node 2325 */ 2326 typedef struct opt_dram_addr_map { 2327 uint32_t base_hi; 2328 uint32_t base_lo; 2329 uint32_t limit_hi; 2330 uint32_t limit_lo; 2331 } opt_dram_addr_map_t; 2332 2333 2334 /* 2335 * Supported AMD processor families 2336 */ 2337 #define AMD_FAMILY_HAMMER 15 2338 #define AMD_FAMILY_GREYHOUND 16 2339 2340 /* 2341 * Whether to have is_opteron() return 1 even when processor isn't supported 2342 */ 2343 uint_t is_opteron_override = 0; 2344 2345 /* 2346 * AMD processor family for current CPU 2347 */ 2348 uint_t opt_family = 0; 2349 2350 2351 /* 2352 * Determine whether we're running on a supported AMD Opteron since reading 2353 * node count and DRAM address map registers may have different format or 2354 * may not be supported across processor families 2355 */ 2356 static int 2357 is_opteron(void) 2358 { 2359 2360 if (x86_vendor != X86_VENDOR_AMD) 2361 return (0); 2362 2363 opt_family = cpuid_getfamily(CPU); 2364 if (opt_family == AMD_FAMILY_HAMMER || 2365 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 2366 return (1); 2367 else 2368 return (0); 2369 } 2370 2371 2372 /* 2373 * Determine NUMA configuration for Opteron from registers that live in PCI 2374 * configuration space 2375 */ 2376 static void 2377 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 2378 node_phys_addr_map_t *node_memory) 2379 { 2380 uint_t bus; 2381 uint_t dev; 2382 struct opt_dram_addr_map dram_map[MAX_NODES]; 2383 uint_t node; 2384 uint_t node_info[MAX_NODES]; 2385 uint_t off_hi; 2386 uint_t off_lo; 2387 uint64_t nb_cfg_reg; 2388 2389 /* 2390 * Read configuration registers from PCI configuration space to 2391 * determine node information, which memory is in each node, etc. 
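 *
 * For example (register value invented for illustration), a node ID
 * register reading of 0x110 has the OPT_NODE_MASK_CNT bits <6:4> equal
 * to 1, so OPT_NODE_CNT() returns 1 and the node count computed below
 * is 1 + 1 = 2.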
2392 * 2393 * Write to PCI configuration space address register to specify 2394 * which configuration register to read and read/write PCI 2395 * configuration space data register to get/set contents 2396 */ 2397 bus = OPT_PCS_BUS_CONFIG; 2398 dev = OPT_PCS_DEV_NODE0; 2399 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 2400 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 2401 2402 /* 2403 * Read node ID register for node 0 to get node count 2404 */ 2405 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 2406 OPT_PCS_OFF_NODEID); 2407 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 2408 2409 /* 2410 * If number of nodes is more than maximum supported, then set node 2411 * count to 1 and treat system as UMA instead of NUMA. 2412 */ 2413 if (*node_cnt > MAX_NODES) { 2414 *node_cnt = 1; 2415 return; 2416 } 2417 2418 /* 2419 * For Greyhound, PCI Extended Configuration Space must be enabled to 2420 * read high DRAM address map base and limit registers 2421 */ 2422 if (opt_family == AMD_FAMILY_GREYHOUND) { 2423 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 2424 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2425 wrmsr(MSR_AMD_NB_CFG, 2426 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 2427 } 2428 2429 for (node = 0; node < *node_cnt; node++) { 2430 uint32_t base_hi; 2431 uint32_t base_lo; 2432 uint32_t limit_hi; 2433 uint32_t limit_lo; 2434 2435 /* 2436 * Read node ID register (except for node 0 which we just read) 2437 */ 2438 if (node > 0) { 2439 node_info[node] = pci_getl_func(bus, dev, 2440 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 2441 } 2442 2443 /* 2444 * Read DRAM base and limit registers which specify 2445 * physical memory range of each node 2446 */ 2447 if (opt_family != AMD_FAMILY_GREYHOUND) 2448 base_hi = 0; 2449 else { 2450 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2451 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2452 base_hi = dram_map[node].base_hi = 2453 inl(PCI_CONFDATA); 2454 } 2455 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 2456 OPT_PCS_FUNC_ADDRMAP, off_lo); 2457 2458 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 2459 mem_intrlv) 2460 *mem_intrlv = *mem_intrlv + 1; 2461 2462 off_hi += 4; /* high limit register offset */ 2463 if (opt_family != AMD_FAMILY_GREYHOUND) 2464 limit_hi = 0; 2465 else { 2466 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2467 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2468 limit_hi = dram_map[node].limit_hi = 2469 inl(PCI_CONFDATA); 2470 } 2471 2472 off_lo += 4; /* low limit register offset */ 2473 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 2474 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 2475 2476 /* 2477 * Increment device number to next node and register offsets 2478 * for DRAM base register of next node 2479 */ 2480 off_hi += 4; 2481 off_lo += 4; 2482 dev++; 2483 2484 /* 2485 * Both read and write enable bits must be enabled in DRAM 2486 * address map base register for physical memory to exist in 2487 * node 2488 */ 2489 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 2490 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 2491 /* 2492 * Mark node memory as non-existent and set start and 2493 * end addresses to be same in node_memory[] 2494 */ 2495 node_memory[node].exists = 0; 2496 node_memory[node].start = node_memory[node].end = 2497 (pfn_t)-1; 2498 continue; 2499 } 2500 2501 /* 2502 * Mark node memory as existing and remember physical address 2503 * range of each node for use later 2504 */ 2505 node_memory[node].exists = 1; 2506 2507 node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 2508 2509 node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 2510 
OPT_DRAMADDR_LO_MASK_OFF); 2511 } 2512 2513 /* 2514 * Restore PCI Extended Configuration Space enable bit 2515 */ 2516 if (opt_family == AMD_FAMILY_GREYHOUND) { 2517 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2518 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 2519 } 2520 } 2521 2522 2523 /* 2524 * Return average amount of time to read vendor ID register on Northbridge 2525 * N times on specified destination node from current CPU 2526 */ 2527 static hrtime_t 2528 opt_probe_vendor(int dest_node, int nreads) 2529 { 2530 int cnt; 2531 uint_t dev; 2532 /* LINTED: set but not used in function */ 2533 volatile uint_t dev_vendor; 2534 hrtime_t elapsed; 2535 hrtime_t end; 2536 int ipl; 2537 hrtime_t start; 2538 2539 dev = OPT_PCS_DEV_NODE0 + dest_node; 2540 kpreempt_disable(); 2541 ipl = spl8(); 2542 outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM, 2543 OPT_PCS_OFF_VENDOR)); 2544 start = gethrtime(); 2545 for (cnt = 0; cnt < nreads; cnt++) 2546 dev_vendor = inl(PCI_CONFDATA); 2547 end = gethrtime(); 2548 elapsed = (end - start) / nreads; 2549 splx(ipl); 2550 kpreempt_enable(); 2551 return (elapsed); 2552 } 2553
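

/*
 * Illustrative sketch only:  this helper is not called anywhere in this
 * file and its name is made up for the example.  It simply restates how
 * opt_get_numa_config() above turns a saved pair of DRAM Address Map
 * base/limit register values into the existence flag and physical page
 * range kept in node_memory[] for one node.
 */
static void
opt_dram_map_to_range(const opt_dram_addr_map_t *map,
    node_phys_addr_map_t *mem)
{
	/*
	 * Both read and write enable must be set in the base register for
	 * the node to have any memory at all
	 */
	if ((map->base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
	    (map->base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
		mem->exists = 0;
		mem->start = mem->end = (pfn_t)-1;
		return;
	}

	mem->exists = 1;

	/* Base register gives the first byte of memory in the node */
	mem->start = btop(OPT_DRAMADDR(map->base_hi, map->base_lo));

	/*
	 * Limit register gives the last 16 MB granule, so fill in the low
	 * offset bits to get the last byte before converting to a page
	 * frame number
	 */
	mem->end = btop(OPT_DRAMADDR(map->limit_hi, map->limit_lo) |
	    OPT_DRAMADDR_LO_MASK_OFF);
}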