/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident "%Z%%M% %I% %E% SMI"


/*
 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
 * ================================================================
 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
 * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
 * one or more CPUs and some local memory.  The CPUs in each node can access
 * the memory in the other nodes but at a higher latency than accessing their
 * local memory.  Typically, a system with only one node has Uniform Memory
 * Access (UMA), but it may be possible to have a one node system that has
 * some global memory outside of the node which is higher latency.
 *
 * Module Description
 * ------------------
 * This module provides a platform interface for determining which CPUs and
 * which memory (and how much) are in a NUMA node and how far each node is from
 * each other.  The interface is used by the Virtual Memory (VM) system and the
 * common lgroup framework.  The VM system uses the plat_*() routines to fill
 * in its memory node (memnode) array with the physical address range spanned
 * by each NUMA node to know which memory belongs to which node, so it can
 * build and manage a physical page free list for each NUMA node and allocate
 * local memory from each node as needed.  The common lgroup framework uses the
 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
 * to each node (leaf lgroup) and how far each node is from each other, so it
 * can build the latency (lgroup) topology for the machine in order to optimize
 * for locality.  Also, lgroup platform handles are used in the interface with
 * this module instead of lgroups, so this module shouldn't need to know
 * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
 * etc. are in each NUMA node, how far each node is from each other, and to use
 * a unique lgroup platform handle to refer to each node through the interface.
 *
 * Determining NUMA Configuration
 * ------------------------------
 * By default, this module will try to determine the NUMA configuration of the
 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
 * Locality Information Table (SLIT).  The SRAT contains info to tell which
 * CPUs and memory are local to a given proximity domain (NUMA node).
 * The SLIT is a matrix that gives the distance between each system locality
 * (which is a NUMA node and should correspond to proximity domains in the
 * SRAT).  For more details on the SRAT and SLIT, please refer to an ACPI 3.0
 * or newer specification.
 *
 * If the SRAT doesn't exist on a system with AMD Opteron processors, we
 * examine registers in PCI configuration space to determine how many nodes
 * are in the system and which CPUs and memory are in each node, which we do
 * while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *       locality info is not guaranteed to work or be compatible across all
 *       Opteron processor families.
 *
 * If the SLIT doesn't exist or doesn't look right, the kernel will probe to
 * determine the distance between nodes as long as the NUMA CPU and memory
 * configuration has been determined (see lgrp_plat_probe() for details).
 *
 * Data Structures
 * ---------------
 * The main data structures used by this code are the following:
 *
 * - lgrp_plat_cpu_node[]		APIC ID to node ID mapping table
 *					indexed by hashed APIC ID (only used
 *					for SRAT)
 *
 * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
 *					different nodes indexed by node ID
 *
 * - lgrp_plat_node_cnt			Number of NUMA nodes in system
 *
 * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
 *					table indexed by node ID (only used
 *					for SRAT)
 *
 * - lgrp_plat_node_memory[]		Table with physical address range for
 *					each node indexed by node ID
 *
 * The code is implemented to make the following always be true:
 *
 *	lgroup platform handle == node ID == memnode ID
 *
 * Moreover, it allows for the proximity domain ID to be equal to all of the
 * above as long as the proximity domain IDs are numbered from 0 to <number of
 * nodes - 1>.  This is done by hashing each proximity domain ID into the range
 * from 0 to <number of nodes - 1>.  Then proximity domain ID N will hash into
 * node ID N, be entered into lgrp_plat_node_domain[N], and be assigned node
 * ID N.  If the proximity domain IDs aren't numbered from 0 to <number of
 * nodes - 1>, then hashing the proximity domain IDs into
 * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
 * to node IDs.  However, the proximity domain IDs may not map to the
 * equivalent node ID since we want to keep the node IDs numbered from 0 to
 * <number of nodes - 1> to minimize cost of searching and potentially space.
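 *
 * As an illustrative example (hypothetical numbers, not taken from any
 * particular machine): on a system where lgrp_plat_node_cnt is 4 and the SRAT
 * numbers its proximity domains 4, 5, 6, and 7, NODE_DOMAIN_HASH() maps them
 * modulo the node count to node IDs 0, 1, 2, and 3 respectively, so domain
 * lookups still work even though the proximity domain IDs no longer equal the
 * node IDs they map to.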
 */


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/cmn_err.h>
#include <sys/controlregs.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>

#include "acpi_fw.h"		/* for SRAT and SLIT */


#define	MAX_NODES		8
#define	NLGRP	(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */

/*
 * Hash CPU APIC ID into CPU to node mapping table using max_ncpus
 * to minimize span of entries used
 */
#define	CPU_NODE_HASH(apicid)		((apicid) % max_ncpus)

/*
 * Hash proximity domain ID into node to domain mapping table using
 * lgrp_plat_node_cnt to minimize span of entries used
 */
#define	NODE_DOMAIN_HASH(domain)	((domain) % lgrp_plat_node_cnt)


/*
 * CPU APIC ID to node ID mapping structure (only used with SRAT)
 */
typedef struct cpu_node_map {
	int		exists;
	uint_t		node;
	uint32_t	apicid;
	uint32_t	prox_domain;
} cpu_node_map_t;

/*
 * Latency statistics
 */
typedef struct lgrp_plat_latency_stats {
	hrtime_t	latencies[MAX_NODES][MAX_NODES];
	hrtime_t	latency_max;
	hrtime_t	latency_min;
} lgrp_plat_latency_stats_t;

/*
 * Memory configuration for probing
 */
typedef struct lgrp_plat_probe_mem_config {
	size_t	probe_memsize;		/* how much memory to probe per node */
	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
} lgrp_plat_probe_mem_config_t;

/*
 * Statistics kept for probing
 */
typedef struct lgrp_plat_probe_stats {
	hrtime_t	flush_cost;
	hrtime_t	probe_cost;
	hrtime_t	probe_cost_total;
	hrtime_t	probe_error_code;
	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
	int		probe_suspect[MAX_NODES][MAX_NODES];
	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
} lgrp_plat_probe_stats_t;

/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 */
typedef struct node_domain_map {
	int		exists;
	uint32_t	prox_domain;
} node_domain_map_t;

/*
 * Node ID and starting and ending page for physical memory in node
 */
typedef struct node_phys_addr_map {
	pfn_t		start;
	pfn_t		end;
	int		exists;
	uint32_t	prox_domain;
} node_phys_addr_map_t;


/*
 * CPU APIC ID to node ID mapping table (only used for SRAT)
 */
static cpu_node_map_t lgrp_plat_cpu_node[NCPU]; 245 246 /* 247 * Latency statistics 248 */ 249 lgrp_plat_latency_stats_t lgrp_plat_lat_stats; 250 251 /* 252 * Whether memory is interleaved across nodes causing MPO to be disabled 253 */ 254 static int lgrp_plat_mem_intrlv = 0; 255 256 /* 257 * Node ID to proximity domain ID mapping table (only used for SRAT) 258 */ 259 static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; 260 261 /* 262 * Physical address range for memory in each node 263 */ 264 static node_phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 265 266 /* 267 * Statistics gotten from probing 268 */ 269 static lgrp_plat_probe_stats_t lgrp_plat_probe_stats; 270 271 /* 272 * Memory configuration for probing 273 */ 274 static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config; 275 276 /* 277 * Error code from processing ACPI SRAT 278 */ 279 static int lgrp_plat_srat_error = 0; 280 281 /* 282 * Error code from processing ACPI SLIT 283 */ 284 static int lgrp_plat_slit_error = 0; 285 286 /* 287 * Allocate lgroup array statically 288 */ 289 static lgrp_t lgrp_space[NLGRP]; 290 static int nlgrps_alloc; 291 292 293 /* 294 * Number of nodes in system 295 */ 296 uint_t lgrp_plat_node_cnt = 1; 297 298 /* 299 * Configuration Parameters for Probing 300 * - lgrp_plat_probe_flags Flags to specify enabling probing, probe 301 * operation, etc. 302 * - lgrp_plat_probe_nrounds How many rounds of probing to do 303 * - lgrp_plat_probe_nsamples Number of samples to take when probing each 304 * node 305 * - lgrp_plat_probe_nreads Number of times to read vendor ID from 306 * Northbridge for each probe 307 */ 308 uint_t lgrp_plat_probe_flags = 0; 309 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 310 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 311 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 312 313 /* 314 * Enable use of ACPI System Resource Affinity Table (SRAT) and System 315 * Locality Information Table (SLIT) 316 */ 317 int lgrp_plat_srat_enable = 1; 318 int lgrp_plat_slit_enable = 1; 319 320 /* 321 * Static array to hold lgroup statistics 322 */ 323 struct lgrp_stats lgrp_stats[NLGRP]; 324 325 326 /* 327 * Forward declarations of platform interface routines 328 */ 329 void plat_build_mem_nodes(struct memlist *list); 330 331 int plat_lgrphand_to_mem_node(lgrp_handle_t hand); 332 333 lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); 334 335 int plat_mnode_xcheck(pfn_t pfncnt); 336 337 int plat_pfn_to_mem_node(pfn_t pfn); 338 339 /* 340 * Forward declarations of lgroup platform interface routines 341 */ 342 lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); 343 344 void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg); 345 346 lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); 347 348 void lgrp_plat_init(void); 349 350 int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); 351 352 void lgrp_plat_main_init(void); 353 354 int lgrp_plat_max_lgrps(void); 355 356 pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, 357 lgrp_mem_query_t query); 358 359 lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn); 360 361 void lgrp_plat_probe(void); 362 363 lgrp_handle_t lgrp_plat_root_hand(void); 364 365 366 /* 367 * Forward declarations of local routines 368 */ 369 static int is_opteron(void); 370 371 static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node); 372 373 static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, 374 uint32_t domain); 375 376 static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 377 
lgrp_plat_latency_stats_t *lat_stats, 378 lgrp_plat_probe_stats_t *probe_stats); 379 380 static int lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 381 lgrp_plat_latency_stats_t *lat_stats); 382 383 static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); 384 385 static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, 386 uint32_t domain); 387 388 static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 389 node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end, 390 uint32_t domain); 391 392 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 393 lgrp_plat_probe_mem_config_t *probe_mem_config, 394 lgrp_plat_latency_stats_t *lat_stats, 395 lgrp_plat_probe_stats_t *probe_stats); 396 397 static int lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 398 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats); 399 400 static int lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt, 401 node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, 402 node_phys_addr_map_t *node_memory); 403 404 static int lgrp_plat_srat_domains(struct srat *tp); 405 406 static void lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 407 lgrp_plat_latency_stats_t *lat_stats); 408 409 static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 410 node_phys_addr_map_t *node_memory); 411 412 static hrtime_t opt_probe_vendor(int dest_node, int nreads); 413 414 415 /* 416 * PLATFORM INTERFACE ROUTINES 417 */ 418 419 /* 420 * Configure memory nodes for machines with more than one node (ie NUMA) 421 */ 422 void 423 plat_build_mem_nodes(struct memlist *list) 424 { 425 pfn_t cur_start; /* start addr of subrange */ 426 pfn_t cur_end; /* end addr of subrange */ 427 pfn_t start; /* start addr of whole range */ 428 pfn_t end; /* end addr of whole range */ 429 430 /* 431 * Boot install lists are arranged <addr, len>, ... 432 */ 433 while (list) { 434 int node; 435 436 start = list->address >> PAGESHIFT; 437 end = (list->address + list->size - 1) >> PAGESHIFT; 438 439 if (start > physmax) { 440 list = list->next; 441 continue; 442 } 443 if (end > physmax) 444 end = physmax; 445 446 /* 447 * When there is only one memnode, just add memory to memnode 448 */ 449 if (max_mem_nodes == 1) { 450 mem_node_add_slice(start, end); 451 list = list->next; 452 continue; 453 } 454 455 /* 456 * mem_node_add_slice() expects to get a memory range that 457 * is within one memnode, so need to split any memory range 458 * that spans multiple memnodes into subranges that are each 459 * contained within one memnode when feeding them to 460 * mem_node_add_slice() 461 */ 462 cur_start = start; 463 do { 464 node = plat_pfn_to_mem_node(cur_start); 465 466 /* 467 * Panic if DRAM address map registers or SRAT say 468 * memory in node doesn't exist or address from 469 * boot installed memory list entry isn't in this node. 470 * This shouldn't happen and rest of code can't deal 471 * with this if it does. 
472 */ 473 if (node < 0 || node >= lgrp_plat_node_cnt || 474 !lgrp_plat_node_memory[node].exists || 475 cur_start < lgrp_plat_node_memory[node].start || 476 cur_start > lgrp_plat_node_memory[node].end) { 477 cmn_err(CE_PANIC, "Don't know which memnode " 478 "to add installed memory address 0x%lx\n", 479 cur_start); 480 } 481 482 /* 483 * End of current subrange should not span memnodes 484 */ 485 cur_end = end; 486 if (lgrp_plat_node_memory[node].exists && 487 cur_end > lgrp_plat_node_memory[node].end) 488 cur_end = lgrp_plat_node_memory[node].end; 489 490 mem_node_add_slice(cur_start, cur_end); 491 492 /* 493 * Next subrange starts after end of current one 494 */ 495 cur_start = cur_end + 1; 496 } while (cur_end < end); 497 498 list = list->next; 499 } 500 mem_node_physalign = 0; 501 mem_node_pfn_shift = 0; 502 } 503 504 505 int 506 plat_lgrphand_to_mem_node(lgrp_handle_t hand) 507 { 508 if (max_mem_nodes == 1) 509 return (0); 510 511 return ((int)hand); 512 } 513 514 515 /* 516 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 517 * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if 518 * a crossing is found and returns 0 otherwise. 519 */ 520 int 521 plat_mnode_xcheck(pfn_t pfncnt) 522 { 523 int node, prevnode = -1, basenode; 524 pfn_t ea, sa; 525 526 for (node = 0; node < lgrp_plat_node_cnt; node++) { 527 528 if (lgrp_plat_node_memory[node].exists == 0) 529 continue; 530 531 if (prevnode == -1) { 532 prevnode = node; 533 basenode = node; 534 continue; 535 } 536 537 /* assume x86 node pfn ranges are in increasing order */ 538 ASSERT(lgrp_plat_node_memory[node].start > 539 lgrp_plat_node_memory[prevnode].end); 540 541 /* 542 * continue if the starting address of node is not contiguous 543 * with the previous node. 544 */ 545 546 if (lgrp_plat_node_memory[node].start != 547 (lgrp_plat_node_memory[prevnode].end + 1)) { 548 basenode = node; 549 prevnode = node; 550 continue; 551 } 552 553 /* check if the starting address of node is pfncnt aligned */ 554 if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) { 555 556 /* 557 * at this point, node starts at an unaligned boundary 558 * and is contiguous with the previous node(s) to 559 * basenode. Check if there is an aligned contiguous 560 * range of length pfncnt that crosses this boundary. 
561 */ 562 563 sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, 564 pfncnt); 565 ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), 566 pfncnt); 567 568 ASSERT((ea - sa) == pfncnt); 569 if (sa >= lgrp_plat_node_memory[basenode].start && 570 ea <= (lgrp_plat_node_memory[node].end + 1)) 571 return (1); 572 } 573 prevnode = node; 574 } 575 return (0); 576 } 577 578 579 lgrp_handle_t 580 plat_mem_node_to_lgrphand(int mnode) 581 { 582 if (max_mem_nodes == 1) 583 return (LGRP_DEFAULT_HANDLE); 584 585 return ((lgrp_handle_t)mnode); 586 } 587 588 589 int 590 plat_pfn_to_mem_node(pfn_t pfn) 591 { 592 int node; 593 594 if (max_mem_nodes == 1) 595 return (0); 596 597 for (node = 0; node < lgrp_plat_node_cnt; node++) { 598 /* 599 * Skip nodes with no memory 600 */ 601 if (!lgrp_plat_node_memory[node].exists) 602 continue; 603 604 if (pfn >= lgrp_plat_node_memory[node].start && 605 pfn <= lgrp_plat_node_memory[node].end) 606 return (node); 607 } 608 609 /* 610 * Didn't find memnode where this PFN lives which should never happen 611 */ 612 ASSERT(node < lgrp_plat_node_cnt); 613 return (-1); 614 } 615 616 617 /* 618 * LGROUP PLATFORM INTERFACE ROUTINES 619 */ 620 621 /* 622 * Allocate additional space for an lgroup. 623 */ 624 /* ARGSUSED */ 625 lgrp_t * 626 lgrp_plat_alloc(lgrp_id_t lgrpid) 627 { 628 lgrp_t *lgrp; 629 630 lgrp = &lgrp_space[nlgrps_alloc++]; 631 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 632 return (NULL); 633 return (lgrp); 634 } 635 636 637 /* 638 * Platform handling for (re)configuration changes 639 */ 640 /* ARGSUSED */ 641 void 642 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 643 { 644 } 645 646 647 /* 648 * Return the platform handle for the lgroup containing the given CPU 649 */ 650 /* ARGSUSED */ 651 lgrp_handle_t 652 lgrp_plat_cpu_to_hand(processorid_t id) 653 { 654 lgrp_handle_t hand; 655 656 if (lgrp_plat_node_cnt == 1) 657 return (LGRP_DEFAULT_HANDLE); 658 659 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 660 lgrp_plat_cpu_node); 661 662 ASSERT(hand != (lgrp_handle_t)-1); 663 if (hand == (lgrp_handle_t)-1) 664 return (LGRP_NULL_HANDLE); 665 666 return (hand); 667 } 668 669 670 /* 671 * Platform-specific initialization of lgroups 672 */ 673 void 674 lgrp_plat_init(void) 675 { 676 #if defined(__xpv) 677 /* 678 * XXPV For now, the hypervisor treats all memory equally. 
	 */
	lgrp_plat_node_cnt = max_mem_nodes = 1;
#else	/* __xpv */
	uint_t	probe_op;

	/*
	 * Initialize as a UMA machine
	 */
	if (lgrp_topo_ht_limit() == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		return;
	}

	/*
	 * Determine which CPUs and memory are local to each other and number
	 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT)
	 */
	lgrp_plat_srat_error = lgrp_plat_process_srat(srat_ptr,
	    &lgrp_plat_node_cnt, lgrp_plat_node_domain, lgrp_plat_cpu_node,
	    lgrp_plat_node_memory);

	/*
	 * Try to use PCI config space registers on Opteron if SRAT doesn't
	 * exist or there is some error processing the SRAT
	 */
	if (lgrp_plat_srat_error != 0 && is_opteron())
		opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv,
		    lgrp_plat_node_memory);

	/*
	 * Don't bother to setup system for multiple lgroups and only use one
	 * memory node when memory is interleaved between any nodes or there is
	 * only one NUMA node
	 *
	 * NOTE: May need to change this for Dynamic Reconfiguration (DR)
	 *	 when and if it happens for x86/x64
	 */
	if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) {
		lgrp_plat_node_cnt = max_mem_nodes = 1;
		(void) lgrp_topo_ht_limit_set(1);
		return;
	}

	/*
	 * Leaf lgroups on x86/x64 architectures contain one physical
	 * processor chip.  Tune lgrp_expand_proc_thresh and
	 * lgrp_expand_proc_diff so that lgrp_choose() will spread
	 * things out aggressively.
	 */
	lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2;
	lgrp_expand_proc_diff = 0;

	/*
	 * There should be one memnode (physical page free list(s)) for
	 * each node
	 */
	max_mem_nodes = lgrp_plat_node_cnt;

	/*
	 * Initialize min and max latency before reading SLIT or probing
	 */
	lgrp_plat_lat_stats.latency_min = -1;
	lgrp_plat_lat_stats.latency_max = 0;

	/*
	 * Determine how far each NUMA node is from each other by
	 * reading ACPI System Locality Information Table (SLIT) if it
	 * exists
	 */
	lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr,
	    lgrp_plat_node_cnt, lgrp_plat_node_memory,
	    &lgrp_plat_lat_stats);
	if (lgrp_plat_slit_error == 0)
		return;

	/*
	 * Probe to determine latency between NUMA nodes when SLIT
	 * doesn't exist or doesn't make sense
	 */
	lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE;

	/*
	 * Specify whether to probe using vendor ID register or page copy
	 * if it hasn't been specified already or is overspecified
	 */
	probe_op = lgrp_plat_probe_flags &
	    (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);

	if (probe_op == 0 ||
	    probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) {
		lgrp_plat_probe_flags &=
		    ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR);
		if (is_opteron())
			lgrp_plat_probe_flags |=
			    LGRP_PLAT_PROBE_VENDOR;
		else
			lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY;
	}

	/*
	 * Probing errors can mess up the lgroup topology and
	 * force us to fall back to a 2 level lgroup topology.
	 * Here we bound how tall the lgroup topology can grow
	 * in hopes of avoiding any anomalies in probing from
	 * messing up the lgroup topology by limiting the
	 * accuracy of the latency topology.
785 * 786 * Assume that nodes will at least be configured in a 787 * ring, so limit height of lgroup topology to be less 788 * than number of nodes on a system with 4 or more 789 * nodes 790 */ 791 if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 792 lgrp_topo_ht_limit_default()) 793 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 794 #endif /* __xpv */ 795 } 796 797 798 /* 799 * Return latency between "from" and "to" lgroups 800 * 801 * This latency number can only be used for relative comparison 802 * between lgroups on the running system, cannot be used across platforms, 803 * and may not reflect the actual latency. It is platform and implementation 804 * specific, so platform gets to decide its value. It would be nice if the 805 * number was at least proportional to make comparisons more meaningful though. 806 */ 807 /* ARGSUSED */ 808 int 809 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 810 { 811 lgrp_handle_t src, dest; 812 int node; 813 814 if (max_mem_nodes == 1) 815 return (0); 816 817 /* 818 * Return max latency for root lgroup 819 */ 820 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 821 return (lgrp_plat_lat_stats.latency_max); 822 823 src = from; 824 dest = to; 825 826 /* 827 * Return 0 for nodes (lgroup platform handles) out of range 828 */ 829 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 830 return (0); 831 832 /* 833 * Probe from current CPU if its lgroup latencies haven't been set yet 834 * and we are trying to get latency from current CPU to some node 835 */ 836 node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 837 ASSERT(node >= 0 && node < lgrp_plat_node_cnt); 838 if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src) 839 lgrp_plat_probe(); 840 841 return (lgrp_plat_lat_stats.latencies[src][dest]); 842 } 843 844 845 /* 846 * Platform-specific initialization 847 */ 848 void 849 lgrp_plat_main_init(void) 850 { 851 int curnode; 852 int ht_limit; 853 int i; 854 855 /* 856 * Print a notice that MPO is disabled when memory is interleaved 857 * across nodes....Would do this when it is discovered, but can't 858 * because it happens way too early during boot.... 859 */ 860 if (lgrp_plat_mem_intrlv) 861 cmn_err(CE_NOTE, 862 "MPO disabled because memory is interleaved\n"); 863 864 /* 865 * Don't bother to do any probing if it is disabled, there is only one 866 * node, or the height of the lgroup topology less than or equal to 2 867 */ 868 ht_limit = lgrp_topo_ht_limit(); 869 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 870 max_mem_nodes == 1 || ht_limit <= 2) { 871 /* 872 * Setup lgroup latencies for 2 level lgroup topology 873 * (ie. local and remote only) if they haven't been set yet 874 */ 875 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 876 lgrp_plat_lat_stats.latency_max == 0) 877 lgrp_plat_2level_setup(lgrp_plat_node_memory, 878 &lgrp_plat_lat_stats); 879 return; 880 } 881 882 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 883 /* 884 * Should have been able to probe from CPU 0 when it was added 885 * to lgroup hierarchy, but may not have been able to then 886 * because it happens so early in boot that gethrtime() hasn't 887 * been initialized. 
(:-(
		 */
		curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node);
		ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt);
		if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0)
			lgrp_plat_probe();

		return;
	}

	/*
	 * When probing memory, use one page for every sample so that taking
	 * multiple samples doesn't keep probing the same page
	 */
	if (lgrp_plat_probe_mem_config.probe_memsize == 0)
		lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE *
		    lgrp_plat_probe_nsamples;

	/*
	 * Map memory in each node needed for probing to determine latency
	 * topology
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	mnode;

		/*
		 * Skip this node and leave its probe page NULL
		 * if it doesn't have any memory
		 */
		mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i);
		if (!mem_node_config[mnode].exists) {
			lgrp_plat_probe_mem_config.probe_va[i] = NULL;
			continue;
		}

		/*
		 * Allocate one kernel virtual page
		 */
		lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena,
		    lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP);
		if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) {
			cmn_err(CE_WARN,
			    "lgrp_plat_main_init: couldn't allocate memory");
			return;
		}

		/*
		 * Get PFN for first page in each node
		 */
		lgrp_plat_probe_mem_config.probe_pfn[i] =
		    mem_node_config[mnode].physbase;

		/*
		 * Map virtual page to first page in node
		 */
		hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i],
		    lgrp_plat_probe_mem_config.probe_memsize,
		    lgrp_plat_probe_mem_config.probe_pfn[i],
		    PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE,
		    HAT_LOAD_NOCONSIST);
	}

	/*
	 * Probe from current CPU
	 */
	lgrp_plat_probe();
}


/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number of
 * nodes.  Once topology is known it returns the actual maximum number of lgrps
 * created.  Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and
 * dynamic addition of new nodes, this number may not grow during system
 * lifetime (yet).
 */
int
lgrp_plat_max_lgrps(void)
{
	return (lgrp_topo_initialized ?
	    lgrp_alloc_max + 1 :
	    lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
}


/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
982 */ 983 pgcnt_t 984 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 985 { 986 int mnode; 987 pgcnt_t npgs = (pgcnt_t)0; 988 extern struct memlist *phys_avail; 989 extern struct memlist *phys_install; 990 991 992 if (plathand == LGRP_DEFAULT_HANDLE) 993 return (lgrp_plat_mem_size_default(plathand, query)); 994 995 if (plathand != LGRP_NULL_HANDLE) { 996 mnode = plat_lgrphand_to_mem_node(plathand); 997 if (mnode >= 0 && mem_node_config[mnode].exists) { 998 switch (query) { 999 case LGRP_MEM_SIZE_FREE: 1000 npgs = MNODE_PGCNT(mnode); 1001 break; 1002 case LGRP_MEM_SIZE_AVAIL: 1003 npgs = mem_node_memlist_pages(mnode, 1004 phys_avail); 1005 break; 1006 case LGRP_MEM_SIZE_INSTALL: 1007 npgs = mem_node_memlist_pages(mnode, 1008 phys_install); 1009 break; 1010 default: 1011 break; 1012 } 1013 } 1014 } 1015 return (npgs); 1016 } 1017 1018 1019 /* 1020 * Return the platform handle of the lgroup that contains the physical memory 1021 * corresponding to the given page frame number 1022 */ 1023 /* ARGSUSED */ 1024 lgrp_handle_t 1025 lgrp_plat_pfn_to_hand(pfn_t pfn) 1026 { 1027 int mnode; 1028 1029 if (max_mem_nodes == 1) 1030 return (LGRP_DEFAULT_HANDLE); 1031 1032 if (pfn > physmax) 1033 return (LGRP_NULL_HANDLE); 1034 1035 mnode = plat_pfn_to_mem_node(pfn); 1036 if (mnode < 0) 1037 return (LGRP_NULL_HANDLE); 1038 1039 return (MEM_NODE_2_LGRPHAND(mnode)); 1040 } 1041 1042 1043 /* 1044 * Probe memory in each node from current CPU to determine latency topology 1045 * 1046 * The probing code will probe the vendor ID register on the Northbridge of 1047 * Opteron processors and probe memory for other processors by default. 1048 * 1049 * Since probing is inherently error prone, the code takes laps across all the 1050 * nodes probing from each node to each of the other nodes some number of 1051 * times. Furthermore, each node is probed some number of times before moving 1052 * onto the next one during each lap. The minimum latency gotten between nodes 1053 * is kept as the latency between the nodes. 1054 * 1055 * After all that, the probe times are adjusted by normalizing values that are 1056 * close to each other and local latencies are made the same. Lastly, the 1057 * latencies are verified to make sure that certain conditions are met (eg. 1058 * local < remote, latency(a, b) == latency(b, a), etc.). 1059 * 1060 * If any of the conditions aren't met, the code will export a NUMA 1061 * configuration with the local CPUs and memory given by the SRAT or PCI config 1062 * space registers and one remote memory latency since it can't tell exactly 1063 * how far each node is from each other. 
1064 */ 1065 void 1066 lgrp_plat_probe(void) 1067 { 1068 int from; 1069 int i; 1070 lgrp_plat_latency_stats_t *lat_stats; 1071 hrtime_t probe_time; 1072 int to; 1073 1074 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 1075 max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 1076 return; 1077 1078 /* 1079 * Determine ID of node containing current CPU 1080 */ 1081 from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 1082 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1083 if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 1084 ASSERT(lgrp_plat_node_domain[from].exists); 1085 1086 /* 1087 * Don't need to probe if got times already 1088 */ 1089 lat_stats = &lgrp_plat_lat_stats; 1090 if (lat_stats->latencies[from][from] != 0) 1091 return; 1092 1093 /* 1094 * Read vendor ID in Northbridge or read and write page(s) 1095 * in each node from current CPU and remember how long it takes, 1096 * so we can build latency topology of machine later. 1097 * This should approximate the memory latency between each node. 1098 */ 1099 for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 1100 for (to = 0; to < lgrp_plat_node_cnt; to++) { 1101 /* 1102 * Get probe time and bail out if can't get it yet 1103 */ 1104 probe_time = lgrp_plat_probe_time(to, 1105 lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config, 1106 &lgrp_plat_lat_stats, &lgrp_plat_probe_stats); 1107 if (probe_time == 0) 1108 return; 1109 1110 /* 1111 * Keep lowest probe time as latency between nodes 1112 */ 1113 if (lat_stats->latencies[from][to] == 0 || 1114 probe_time < lat_stats->latencies[from][to]) 1115 lat_stats->latencies[from][to] = probe_time; 1116 1117 /* 1118 * Update overall minimum and maximum probe times 1119 * across all nodes 1120 */ 1121 if (probe_time < lat_stats->latency_min || 1122 lat_stats->latency_min == -1) 1123 lat_stats->latency_min = probe_time; 1124 if (probe_time > lat_stats->latency_max) 1125 lat_stats->latency_max = probe_time; 1126 } 1127 } 1128 1129 /* 1130 * - Fix up latencies such that local latencies are same, 1131 * latency(i, j) == latency(j, i), etc. 
(if possible) 1132 * 1133 * - Verify that latencies look ok 1134 * 1135 * - Fallback to just optimizing for local and remote if 1136 * latencies didn't look right 1137 */ 1138 lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats, 1139 &lgrp_plat_probe_stats); 1140 lgrp_plat_probe_stats.probe_error_code = 1141 lgrp_plat_latency_verify(lgrp_plat_node_memory, 1142 &lgrp_plat_lat_stats); 1143 if (lgrp_plat_probe_stats.probe_error_code) 1144 lgrp_plat_2level_setup(lgrp_plat_node_memory, 1145 &lgrp_plat_lat_stats); 1146 } 1147 1148 1149 /* 1150 * Return platform handle for root lgroup 1151 */ 1152 lgrp_handle_t 1153 lgrp_plat_root_hand(void) 1154 { 1155 return (LGRP_DEFAULT_HANDLE); 1156 } 1157 1158 1159 /* 1160 * INTERNAL ROUTINES 1161 */ 1162 1163 1164 /* 1165 * Update CPU to node mapping for given CPU and proximity domain (and returns 1166 * negative numbers for errors and positive ones for success) 1167 */ 1168 static int 1169 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, 1170 cpu_node_map_t *cpu_node, uint32_t apicid, uint32_t domain) 1171 { 1172 uint_t i; 1173 uint_t start; 1174 int node; 1175 1176 /* 1177 * Get node number for proximity domain 1178 */ 1179 node = lgrp_plat_domain_to_node(node_domain, domain); 1180 if (node == -1) { 1181 node = lgrp_plat_node_domain_update(node_domain, domain); 1182 if (node == -1) 1183 return (-1); 1184 } 1185 1186 /* 1187 * Hash given CPU APIC ID into CPU to node mapping table/array and 1188 * enter it and its corresponding node and proximity domain IDs into 1189 * first non-existent or matching entry 1190 */ 1191 i = start = CPU_NODE_HASH(apicid); 1192 do { 1193 if (cpu_node[i].exists) { 1194 /* 1195 * Update already existing entry for CPU 1196 */ 1197 if (cpu_node[i].apicid == apicid) { 1198 /* 1199 * Just return when everything same 1200 */ 1201 if (cpu_node[i].prox_domain == domain && 1202 cpu_node[i].node == node) 1203 return (1); 1204 1205 /* 1206 * Assert that proximity domain and node IDs 1207 * should be same and return error on non-debug 1208 * kernel 1209 */ 1210 ASSERT(cpu_node[i].prox_domain == domain && 1211 cpu_node[i].node == node); 1212 return (-1); 1213 } 1214 } else { 1215 /* 1216 * Create new entry for CPU 1217 */ 1218 cpu_node[i].exists = 1; 1219 cpu_node[i].apicid = apicid; 1220 cpu_node[i].prox_domain = domain; 1221 cpu_node[i].node = node; 1222 return (0); 1223 } 1224 i = CPU_NODE_HASH(i + 1); 1225 } while (i != start); 1226 1227 /* 1228 * Ran out of supported number of entries which shouldn't happen.... 1229 */ 1230 ASSERT(i != start); 1231 return (-1); 1232 } 1233 1234 1235 /* 1236 * Get node ID for given CPU ID 1237 */ 1238 static int 1239 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node) 1240 { 1241 uint32_t apicid; 1242 uint_t i; 1243 uint_t start; 1244 1245 if (cp == NULL) 1246 return (-1); 1247 1248 /* 1249 * SRAT doesn't exist, isn't enabled, or there was an error processing 1250 * it, so return chip ID for Opteron and -1 otherwise. 
1251 */ 1252 if (srat_ptr == NULL || !lgrp_plat_srat_enable || 1253 lgrp_plat_srat_error) { 1254 if (is_opteron()) 1255 return (pg_plat_hw_instance_id(cp, PGHW_CHIP)); 1256 return (-1); 1257 } 1258 1259 /* 1260 * SRAT does exist, so get APIC ID for given CPU and map that to its 1261 * node ID 1262 */ 1263 apicid = cpuid_get_apicid(cp); 1264 i = start = CPU_NODE_HASH(apicid); 1265 do { 1266 if (cpu_node[i].apicid == apicid && cpu_node[i].exists) 1267 return (cpu_node[i].node); 1268 i = CPU_NODE_HASH(i + 1); 1269 } while (i != start); 1270 return (-1); 1271 } 1272 1273 1274 /* 1275 * Return node number for given proximity domain/system locality 1276 */ 1277 static int 1278 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, uint32_t domain) 1279 { 1280 uint_t node; 1281 uint_t start; 1282 1283 /* 1284 * Hash proximity domain ID into node to domain mapping table (array), 1285 * search for entry with matching proximity domain ID, and return index 1286 * of matching entry as node ID. 1287 */ 1288 node = start = NODE_DOMAIN_HASH(domain); 1289 do { 1290 if (node_domain[node].prox_domain == domain && 1291 node_domain[node].exists) 1292 return (node); 1293 node = NODE_DOMAIN_HASH(node + 1); 1294 } while (node != start); 1295 return (-1); 1296 } 1297 1298 1299 /* 1300 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 1301 * be considered same 1302 */ 1303 #define LGRP_LAT_TOLERANCE_SHIFT 4 1304 1305 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 1306 1307 1308 /* 1309 * Adjust latencies between nodes to be symmetric, normalize latencies between 1310 * any nodes that are within some tolerance to be same, and make local 1311 * latencies be same 1312 */ 1313 static void 1314 lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 1315 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1316 { 1317 int i; 1318 int j; 1319 int k; 1320 int l; 1321 u_longlong_t max; 1322 u_longlong_t min; 1323 u_longlong_t t; 1324 u_longlong_t t1; 1325 u_longlong_t t2; 1326 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 1327 int lat_corrected[MAX_NODES][MAX_NODES]; 1328 1329 /* 1330 * Nothing to do when this is an UMA machine or don't have args needed 1331 */ 1332 if (max_mem_nodes == 1) 1333 return; 1334 1335 ASSERT(node_memory != NULL && lat_stats != NULL && 1336 probe_stats != NULL); 1337 1338 /* 1339 * Make sure that latencies are symmetric between any two nodes 1340 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1341 */ 1342 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1343 if (!node_memory[i].exists) 1344 continue; 1345 1346 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1347 if (!node_memory[j].exists) 1348 continue; 1349 1350 t1 = lat_stats->latencies[i][j]; 1351 t2 = lat_stats->latencies[j][i]; 1352 1353 if (t1 == 0 || t2 == 0 || t1 == t2) 1354 continue; 1355 1356 /* 1357 * Latencies should be same 1358 * - Use minimum of two latencies which should be same 1359 * - Track suspect probe times not within tolerance of 1360 * min value 1361 * - Remember how much values are corrected by 1362 */ 1363 if (t1 > t2) { 1364 t = t2; 1365 probe_stats->probe_errors[i][j] += t1 - t2; 1366 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 1367 probe_stats->probe_suspect[i][j]++; 1368 probe_stats->probe_suspect[j][i]++; 1369 } 1370 } else if (t2 > t1) { 1371 t = t1; 1372 probe_stats->probe_errors[j][i] += t2 - t1; 1373 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 1374 probe_stats->probe_suspect[i][j]++; 1375 probe_stats->probe_suspect[j][i]++; 1376 } 1377 } 1378 1379 lat_stats->latencies[i][j] = 1380 lat_stats->latencies[j][i] = t; 1381 lgrp_config(cflag, t1, t); 1382 lgrp_config(cflag, t2, t); 1383 } 1384 } 1385 1386 /* 1387 * Keep track of which latencies get corrected 1388 */ 1389 for (i = 0; i < MAX_NODES; i++) 1390 for (j = 0; j < MAX_NODES; j++) 1391 lat_corrected[i][j] = 0; 1392 1393 /* 1394 * For every two nodes, see whether there is another pair of nodes which 1395 * are about the same distance apart and make the latencies be the same 1396 * if they are close enough together 1397 */ 1398 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1399 if (!node_memory[i].exists) 1400 continue; 1401 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1402 if (!node_memory[j].exists) 1403 continue; 1404 /* 1405 * Pick one pair of nodes (i, j) 1406 * and get latency between them 1407 */ 1408 t1 = lat_stats->latencies[i][j]; 1409 1410 /* 1411 * Skip this pair of nodes if there isn't a latency 1412 * for it yet 1413 */ 1414 if (t1 == 0) 1415 continue; 1416 1417 for (k = 0; k < lgrp_plat_node_cnt; k++) { 1418 if (!node_memory[k].exists) 1419 continue; 1420 for (l = 0; l < lgrp_plat_node_cnt; l++) { 1421 if (!node_memory[l].exists) 1422 continue; 1423 /* 1424 * Pick another pair of nodes (k, l) 1425 * not same as (i, j) and get latency 1426 * between them 1427 */ 1428 if (k == i && l == j) 1429 continue; 1430 1431 t2 = lat_stats->latencies[k][l]; 1432 1433 /* 1434 * Skip this pair of nodes if there 1435 * isn't a latency for it yet 1436 */ 1437 1438 if (t2 == 0) 1439 continue; 1440 1441 /* 1442 * Skip nodes (k, l) if they already 1443 * have same latency as (i, j) or 1444 * their latency isn't close enough to 1445 * be considered/made the same 1446 */ 1447 if (t1 == t2 || (t1 > t2 && t1 - t2 > 1448 t1 >> lgrp_plat_probe_lt_shift) || 1449 (t2 > t1 && t2 - t1 > 1450 t2 >> lgrp_plat_probe_lt_shift)) 1451 continue; 1452 1453 /* 1454 * Make latency(i, j) same as 1455 * latency(k, l), try to use latency 1456 * that has been adjusted already to get 1457 * more consistency (if possible), and 1458 * remember which latencies were 1459 * adjusted for next time 1460 */ 1461 if (lat_corrected[i][j]) { 1462 t = t1; 1463 lgrp_config(cflag, t2, t); 1464 t2 = t; 1465 } else if (lat_corrected[k][l]) { 1466 t = t2; 1467 lgrp_config(cflag, t1, t); 1468 t1 = t; 1469 } else { 1470 if (t1 > t2) 1471 t = t2; 1472 else 1473 t = t1; 1474 lgrp_config(cflag, t1, t); 1475 lgrp_config(cflag, t2, t); 1476 t1 = t2 = t; 
1477 } 1478 1479 lat_stats->latencies[i][j] = 1480 lat_stats->latencies[k][l] = t; 1481 1482 lat_corrected[i][j] = 1483 lat_corrected[k][l] = 1; 1484 } 1485 } 1486 } 1487 } 1488 1489 /* 1490 * Local latencies should be same 1491 * - Find min and max local latencies 1492 * - Make all local latencies be minimum 1493 */ 1494 min = -1; 1495 max = 0; 1496 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1497 if (!node_memory[i].exists) 1498 continue; 1499 t = lat_stats->latencies[i][i]; 1500 if (t == 0) 1501 continue; 1502 if (min == -1 || t < min) 1503 min = t; 1504 if (t > max) 1505 max = t; 1506 } 1507 if (min != max) { 1508 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1509 int local; 1510 1511 if (!node_memory[i].exists) 1512 continue; 1513 1514 local = lat_stats->latencies[i][i]; 1515 if (local == 0) 1516 continue; 1517 1518 /* 1519 * Track suspect probe times that aren't within 1520 * tolerance of minimum local latency and how much 1521 * probe times are corrected by 1522 */ 1523 if (local - min > min >> lgrp_plat_probe_lt_shift) 1524 probe_stats->probe_suspect[i][i]++; 1525 1526 probe_stats->probe_errors[i][i] += local - min; 1527 1528 /* 1529 * Make local latencies be minimum 1530 */ 1531 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 1532 lat_stats->latencies[i][i] = min; 1533 } 1534 } 1535 1536 /* 1537 * Determine max probe time again since just adjusted latencies 1538 */ 1539 lat_stats->latency_max = 0; 1540 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1541 if (!node_memory[i].exists) 1542 continue; 1543 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1544 if (!node_memory[j].exists) 1545 continue; 1546 t = lat_stats->latencies[i][j]; 1547 if (t > lat_stats->latency_max) 1548 lat_stats->latency_max = t; 1549 } 1550 } 1551 } 1552 1553 1554 /* 1555 * Verify following about latencies between nodes: 1556 * 1557 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 1558 * - Local latencies same 1559 * - Local < remote 1560 * - Number of latencies seen is reasonable 1561 * - Number of occurrences of a given latency should be more than 1 1562 * 1563 * Returns: 1564 * 0 Success 1565 * -1 Not symmetric 1566 * -2 Local latencies not same 1567 * -3 Local >= remote 1568 */ 1569 static int 1570 lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 1571 lgrp_plat_latency_stats_t *lat_stats) 1572 { 1573 int i; 1574 int j; 1575 u_longlong_t t1; 1576 u_longlong_t t2; 1577 1578 ASSERT(node_memory != NULL && lat_stats != NULL); 1579 1580 /* 1581 * Nothing to do when this is an UMA machine, lgroup topology is 1582 * limited to 2 levels, or there aren't any probe times yet 1583 */ 1584 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1585 lat_stats->latencies[0][0] == 0) 1586 return (0); 1587 1588 /* 1589 * Make sure that latencies are symmetric between any two nodes 1590 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1591 */ 1592 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1593 if (!node_memory[i].exists) 1594 continue; 1595 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1596 if (!node_memory[j].exists) 1597 continue; 1598 t1 = lat_stats->latencies[i][j]; 1599 t2 = lat_stats->latencies[j][i]; 1600 1601 if (t1 == 0 || t2 == 0 || t1 == t2) 1602 continue; 1603 1604 return (-1); 1605 } 1606 } 1607 1608 /* 1609 * Local latencies should be same 1610 */ 1611 t1 = lat_stats->latencies[0][0]; 1612 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1613 if (!node_memory[i].exists) 1614 continue; 1615 1616 t2 = lat_stats->latencies[i][i]; 1617 if (t2 == 0) 1618 continue; 1619 1620 if (t1 == 0) { 1621 t1 = t2; 1622 continue; 1623 } 1624 1625 if (t1 != t2) 1626 return (-2); 1627 } 1628 1629 /* 1630 * Local latencies should be less than remote 1631 */ 1632 if (t1) { 1633 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1634 if (!node_memory[i].exists) 1635 continue; 1636 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1637 if (!node_memory[j].exists) 1638 continue; 1639 t2 = lat_stats->latencies[i][j]; 1640 if (i == j || t2 == 0) 1641 continue; 1642 1643 if (t1 >= t2) 1644 return (-3); 1645 } 1646 } 1647 } 1648 1649 return (0); 1650 } 1651 1652 1653 /* 1654 * Return the number of free, allocatable, or installed 1655 * pages in an lgroup 1656 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 1657 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 1658 */ 1659 /* ARGSUSED */ 1660 static pgcnt_t 1661 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 1662 { 1663 struct memlist *mlist; 1664 pgcnt_t npgs = 0; 1665 extern struct memlist *phys_avail; 1666 extern struct memlist *phys_install; 1667 1668 switch (query) { 1669 case LGRP_MEM_SIZE_FREE: 1670 return ((pgcnt_t)freemem); 1671 case LGRP_MEM_SIZE_AVAIL: 1672 memlist_read_lock(); 1673 for (mlist = phys_avail; mlist; mlist = mlist->next) 1674 npgs += btop(mlist->size); 1675 memlist_read_unlock(); 1676 return (npgs); 1677 case LGRP_MEM_SIZE_INSTALL: 1678 memlist_read_lock(); 1679 for (mlist = phys_install; mlist; mlist = mlist->next) 1680 npgs += btop(mlist->size); 1681 memlist_read_unlock(); 1682 return (npgs); 1683 default: 1684 return ((pgcnt_t)0); 1685 } 1686 } 1687 1688 1689 /* 1690 * Update node to proximity domain mappings for given domain and return node ID 1691 */ 1692 static int 1693 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, uint32_t domain) 1694 { 1695 uint_t node; 1696 uint_t start; 1697 1698 /* 1699 * Hash proximity domain ID into node to domain mapping table (array) 1700 * and add entry for it into first non-existent or matching entry found 1701 */ 1702 node = start = NODE_DOMAIN_HASH(domain); 1703 do { 1704 /* 1705 * Entry doesn't exist yet, so create one for this proximity 1706 * domain and return node ID which is index into mapping table. 1707 */ 1708 if (!node_domain[node].exists) { 1709 node_domain[node].exists = 1; 1710 node_domain[node].prox_domain = domain; 1711 return (node); 1712 } 1713 1714 /* 1715 * Entry exists for this proximity domain already, so just 1716 * return node ID (index into table). 1717 */ 1718 if (node_domain[node].prox_domain == domain) 1719 return (node); 1720 node = NODE_DOMAIN_HASH(node + 1); 1721 } while (node != start); 1722 1723 /* 1724 * Ran out of supported number of entries which shouldn't happen.... 
1725 */ 1726 ASSERT(node != start); 1727 return (-1); 1728 } 1729 1730 1731 /* 1732 * Update node memory information for given proximity domain with specified 1733 * starting and ending physical address range (and return positive numbers for 1734 * success and negative ones for errors) 1735 */ 1736 static int 1737 lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 1738 node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end, 1739 uint32_t domain) 1740 { 1741 int node; 1742 1743 /* 1744 * Get node number for proximity domain 1745 */ 1746 node = lgrp_plat_domain_to_node(node_domain, domain); 1747 if (node == -1) { 1748 node = lgrp_plat_node_domain_update(node_domain, domain); 1749 if (node == -1) 1750 return (-1); 1751 } 1752 1753 /* 1754 * Create entry in table for node if it doesn't exist 1755 */ 1756 if (!node_memory[node].exists) { 1757 node_memory[node].exists = 1; 1758 node_memory[node].start = btop(start); 1759 node_memory[node].end = btop(end); 1760 node_memory[node].prox_domain = domain; 1761 return (0); 1762 } 1763 1764 /* 1765 * Entry already exists for this proximity domain 1766 * 1767 * There may be more than one SRAT memory entry for a domain, so we may 1768 * need to update existing start or end address for the node. 1769 */ 1770 if (node_memory[node].prox_domain == domain) { 1771 if (btop(start) < node_memory[node].start) 1772 node_memory[node].start = btop(start); 1773 if (btop(end) > node_memory[node].end) 1774 node_memory[node].end = btop(end); 1775 return (1); 1776 } 1777 return (-2); 1778 } 1779 1780 1781 /* 1782 * Return time needed to probe from current CPU to memory in given node 1783 */ 1784 static hrtime_t 1785 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 1786 lgrp_plat_probe_mem_config_t *probe_mem_config, 1787 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1788 { 1789 caddr_t buf; 1790 hrtime_t elapsed; 1791 hrtime_t end; 1792 int from; 1793 int i; 1794 int ipl; 1795 hrtime_t max; 1796 hrtime_t min; 1797 hrtime_t start; 1798 extern int use_sse_pagecopy; 1799 1800 /* 1801 * Determine ID of node containing current CPU 1802 */ 1803 from = lgrp_plat_cpu_to_node(CPU, cpu_node); 1804 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1805 1806 /* 1807 * Do common work for probing main memory 1808 */ 1809 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 1810 /* 1811 * Skip probing any nodes without memory and 1812 * set probe time to 0 1813 */ 1814 if (probe_mem_config->probe_va[to] == NULL) { 1815 lat_stats->latencies[from][to] = 0; 1816 return (0); 1817 } 1818 1819 /* 1820 * Invalidate caches once instead of once every sample 1821 * which should cut cost of probing by a lot 1822 */ 1823 probe_stats->flush_cost = gethrtime(); 1824 invalidate_cache(); 1825 probe_stats->flush_cost = gethrtime() - 1826 probe_stats->flush_cost; 1827 probe_stats->probe_cost_total += probe_stats->flush_cost; 1828 } 1829 1830 /* 1831 * Probe from current CPU to given memory using specified operation 1832 * and take specified number of samples 1833 */ 1834 max = 0; 1835 min = -1; 1836 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 1837 probe_stats->probe_cost = gethrtime(); 1838 1839 /* 1840 * Can't measure probe time if gethrtime() isn't working yet 1841 */ 1842 if (probe_stats->probe_cost == 0 && gethrtime() == 0) 1843 return (0); 1844 1845 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 1846 /* 1847 * Measure how long it takes to read vendor ID from 1848 * Northbridge 1849 */ 1850 elapsed = opt_probe_vendor(to, 
lgrp_plat_probe_nreads); 1851 } else { 1852 /* 1853 * Measure how long it takes to copy page 1854 * on top of itself 1855 */ 1856 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 1857 1858 kpreempt_disable(); 1859 ipl = splhigh(); 1860 start = gethrtime(); 1861 if (use_sse_pagecopy) 1862 hwblkpagecopy(buf, buf); 1863 else 1864 bcopy(buf, buf, PAGESIZE); 1865 end = gethrtime(); 1866 elapsed = end - start; 1867 splx(ipl); 1868 kpreempt_enable(); 1869 } 1870 1871 probe_stats->probe_cost = gethrtime() - 1872 probe_stats->probe_cost; 1873 probe_stats->probe_cost_total += probe_stats->probe_cost; 1874 1875 if (min == -1 || elapsed < min) 1876 min = elapsed; 1877 if (elapsed > max) 1878 max = elapsed; 1879 } 1880 1881 /* 1882 * Update minimum and maximum probe times between 1883 * these two nodes 1884 */ 1885 if (min < probe_stats->probe_min[from][to] || 1886 probe_stats->probe_min[from][to] == 0) 1887 probe_stats->probe_min[from][to] = min; 1888 1889 if (max > probe_stats->probe_max[from][to]) 1890 probe_stats->probe_max[from][to] = max; 1891 1892 return (min); 1893 } 1894 1895 1896 /* 1897 * Read ACPI System Locality Information Table (SLIT) to determine how far each 1898 * NUMA node is from each other 1899 */ 1900 static int 1901 lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 1902 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats) 1903 { 1904 int i; 1905 int j; 1906 int localities; 1907 hrtime_t max; 1908 hrtime_t min; 1909 int retval; 1910 uint8_t *slit_entries; 1911 1912 if (tp == NULL || !lgrp_plat_slit_enable) 1913 return (1); 1914 1915 if (lat_stats == NULL) 1916 return (2); 1917 1918 localities = tp->number; 1919 if (localities != node_cnt) 1920 return (3); 1921 1922 min = lat_stats->latency_min; 1923 max = lat_stats->latency_max; 1924 1925 /* 1926 * Fill in latency matrix based on SLIT entries 1927 */ 1928 slit_entries = tp->entry; 1929 for (i = 0; i < localities; i++) { 1930 for (j = 0; j < localities; j++) { 1931 uint8_t latency; 1932 1933 latency = slit_entries[(i * localities) + j]; 1934 lat_stats->latencies[i][j] = latency; 1935 if (latency < min || min == -1) 1936 min = latency; 1937 if (latency > max) 1938 max = latency; 1939 } 1940 } 1941 1942 /* 1943 * Verify that latencies/distances given in SLIT look reasonable 1944 */ 1945 retval = lgrp_plat_latency_verify(node_memory, lat_stats); 1946 1947 if (retval) { 1948 /* 1949 * Reinitialize (zero) latency table since SLIT doesn't look 1950 * right 1951 */ 1952 for (i = 0; i < localities; i++) { 1953 for (j = 0; j < localities; j++) 1954 lat_stats->latencies[i][j] = 0; 1955 } 1956 } else { 1957 /* 1958 * Update min and max latencies seen since SLIT looks valid 1959 */ 1960 lat_stats->latency_min = min; 1961 lat_stats->latency_max = max; 1962 } 1963 1964 return (retval); 1965 } 1966 1967 1968 /* 1969 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 1970 * and memory are local to each other in the same NUMA node 1971 */ 1972 static int 1973 lgrp_plat_process_srat(struct srat *tp, uint_t *node_cnt, 1974 node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, 1975 node_phys_addr_map_t *node_memory) 1976 { 1977 struct srat_item *srat_end; 1978 int i; 1979 struct srat_item *item; 1980 1981 if (tp == NULL || !lgrp_plat_srat_enable) 1982 return (1); 1983 1984 /* 1985 * Determine number of nodes by counting number of proximity domains in 1986 * SRAT 1987 */ 1988 if (node_cnt) { 1989 int nodes; 1990 1991 nodes = lgrp_plat_srat_domains(tp); 1992 if (nodes < 0) { 1993 *node_cnt = 1; 
1994 return (2); 1995 } 1996 *node_cnt = nodes; 1997 } 1998 1999 /* 2000 * Walk through SRAT, examining each CPU and memory entry to determine 2001 * which CPUs and memory belong to which node. 2002 */ 2003 item = tp->list; 2004 srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2005 while (item < srat_end) { 2006 uint32_t apic_id; 2007 uint32_t domain; 2008 uint64_t end; 2009 uint64_t length; 2010 uint64_t start; 2011 2012 switch (item->type) { 2013 case SRAT_PROCESSOR: /* CPU entry */ 2014 if (!(item->i.p.flags & SRAT_ENABLED) || 2015 cpu_node == NULL) 2016 break; 2017 2018 /* 2019 * Calculate domain (node) ID and fill in APIC ID to 2020 * domain/node mapping table 2021 */ 2022 domain = item->i.p.domain1; 2023 for (i = 0; i < 3; i++) { 2024 domain += item->i.p.domain2[i] << 2025 ((i + 1) * 8); 2026 } 2027 apic_id = item->i.p.apic_id; 2028 2029 if (lgrp_plat_cpu_node_update(node_domain, cpu_node, 2030 apic_id, domain) < 0) 2031 return (3); 2032 break; 2033 2034 case SRAT_MEMORY: /* memory entry */ 2035 if (!(item->i.m.flags & SRAT_ENABLED) || 2036 node_memory == NULL) 2037 break; 2038 2039 /* 2040 * Get domain (node) ID and fill in domain/node 2041 * to memory mapping table 2042 */ 2043 domain = item->i.m.domain; 2044 start = item->i.m.base_addr; 2045 length = item->i.m.len; 2046 end = start + length - 1; 2047 2048 if (lgrp_plat_node_memory_update(node_domain, 2049 node_memory, start, end, domain) < 0) 2050 return (4); 2051 break; 2052 2053 default: 2054 break; 2055 } 2056 2057 item = (struct srat_item *)((uintptr_t)item + item->len); 2058 } 2059 return (0); 2060 } 2061 2062 2063 /* 2064 * Return number of proximity domains given in ACPI SRAT 2065 */ 2066 static int 2067 lgrp_plat_srat_domains(struct srat *tp) 2068 { 2069 int domain_cnt; 2070 struct srat_item *end; 2071 int i; 2072 struct srat_item *item; 2073 node_domain_map_t node_domain[MAX_NODES]; 2074 2075 2076 if (tp == NULL || !lgrp_plat_srat_enable) 2077 return (1); 2078 2079 /* 2080 * Walk through SRAT, examining each CPU and memory entry to determine 2081 * proximity domain ID for each. 
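 *
 * Each proximity domain ID found is hashed into the node_domain[] table
 * (open addressing with linear probing), so a domain that appears in
 * multiple SRAT entries is only counted once.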
2082  */
2083 	domain_cnt = 0;
2084 	item = tp->list;
2085 	end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp);
2086 	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
2087 	while (item < end) {
2088 		uint32_t	domain;
2089 		boolean_t	overflow;
2090 		uint_t		start;
		boolean_t	valid = B_FALSE;
2091 
2092 		switch (item->type) {
2093 		case SRAT_PROCESSOR:	/* CPU entry */
2094 			if (!(item->i.p.flags & SRAT_ENABLED))
2095 				break;
2096 			domain = item->i.p.domain1;
2097 			for (i = 0; i < 3; i++) {
2098 				domain += item->i.p.domain2[i] <<
2099 				    ((i + 1) * 8);
2100 			}
			valid = B_TRUE;
2101 			break;
2102 
2103 		case SRAT_MEMORY:	/* memory entry */
2104 			if (!(item->i.m.flags & SRAT_ENABLED))
2105 				break;
2106 			domain = item->i.m.domain;
			valid = B_TRUE;
2107 			break;
2108 
2109 		default:
2110 			break;
2111 		}
2112 
		/*
		 * Skip entries that are disabled or of an unrecognized type
		 * since they don't provide a proximity domain ID to count
		 */
		if (!valid) {
			item = (struct srat_item *)((uintptr_t)item +
			    item->len);
			continue;
		}

2113 		/*
2114 		 * Count and keep track of which proximity domain IDs have been seen
2115 		 */
2116 		start = i = domain % MAX_NODES;
2117 		overflow = B_TRUE;
2118 		do {
2119 			/*
2120 			 * Create an entry for the proximity domain and
2121 			 * increment the count when no entry exists where the
2122 			 * proximity domain hashed
2123 			 */
2124 			if (!node_domain[i].exists) {
2125 				node_domain[i].exists = 1;
2126 				node_domain[i].prox_domain = domain;
2127 				domain_cnt++;
2128 				overflow = B_FALSE;
2129 				break;
2130 			}
2131 
2132 			/*
2133 			 * Nothing to do when this proximity domain has
2134 			 * already been seen and has an entry
2135 			 */
2136 			if (node_domain[i].prox_domain == domain) {
2137 				overflow = B_FALSE;
2138 				break;
2139 			}
2140 
2141 			/*
2142 			 * An entry already exists where this proximity domain
2143 			 * hashed, but for a different proximity domain, so
2144 			 * keep searching for an empty slot or a matching
2145 			 * entry, whichever comes first.
2146 			 */
2147 			i = (i + 1) % MAX_NODES;
2148 		} while (i != start);
2149 
2150 		/*
2151 		 * Didn't find an empty or matching entry, which means there
2152 		 * are more proximity domains than supported nodes
2153 		 */
2154 		ASSERT(overflow != B_TRUE);
2155 		if (overflow == B_TRUE)
2156 			return (-1);
2157 
2158 		item = (struct srat_item *)((uintptr_t)item + item->len);
2159 	}
2160 	return (domain_cnt);
2161 }
2162 
2163 
2164 /*
2165  * Set lgroup latencies for a 2-level lgroup topology
2166  */
2167 static void
2168 lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory,
2169     lgrp_plat_latency_stats_t *lat_stats)
2170 {
2171 	int	i;
2172 
2173 	ASSERT(node_memory != NULL && lat_stats != NULL);
2174 
2175 	if (lgrp_plat_node_cnt >= 4)
2176 		cmn_err(CE_NOTE,
2177 		    "MPO only optimizing for local and remote\n");
2178 	for (i = 0; i < lgrp_plat_node_cnt; i++) {
2179 		int	j;
2180 
2181 		if (!node_memory[i].exists)
2182 			continue;
2183 		for (j = 0; j < lgrp_plat_node_cnt; j++) {
2184 			if (!node_memory[j].exists)
2185 				continue;
2186 			if (i == j)
2187 				lat_stats->latencies[i][j] = 2;
2188 			else
2189 				lat_stats->latencies[i][j] = 3;
2190 		}
2191 	}
2192 	lat_stats->latency_min = 2;
2193 	lat_stats->latency_max = 3;
2194 	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
2195 }
2196 
2197 
2198 /*
2199  * The following Opteron-specific constants, macros, types, and routines define
2200  * PCI configuration space registers and how to read them to determine the NUMA
2201  * configuration of *supported* Opteron processors. They provide the same
2202  * information that may be gotten from the ACPI System Resource Affinity Table
2203  * (SRAT) if it exists on the machine of interest.
2204  *
2205  * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
2206  * of interest describes all of these registers and their contents.
The main
2207  * registers used by this code to determine the NUMA configuration of the
2208  * machine are the node ID register for the number of NUMA nodes and the DRAM
2209  * address map registers for the physical address range of each node.
2210  *
2211  * NOTE: The format and how to determine the NUMA configuration using PCI
2212  *	 config space registers may change or may not be supported in future
2213  *	 Opteron processor families.
2214  */
2215 
2216 /*
2217  * How many bits to shift Opteron DRAM Address Map base and limit registers
2218  * to get actual value
2219  */
2220 #define	OPT_DRAMADDR_HI_LSHIFT_ADDR 40	/* shift left for address */
2221 #define	OPT_DRAMADDR_LO_LSHIFT_ADDR 8	/* shift left for address */
2222 
2223 #define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF /* address bits 47-40 */
2224 #define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000 /* address bits 39-24 */
2225 
2226 #define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF /* offset for address */
2227 
2228 /*
2229  * Macros to derive addresses from Opteron DRAM Address Map registers
2230  */
2231 #define	OPT_DRAMADDR_HI(reg) \
2232 	(((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
2233 	    OPT_DRAMADDR_HI_LSHIFT_ADDR)
2234 
2235 #define	OPT_DRAMADDR_LO(reg) \
2236 	(((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
2237 	    OPT_DRAMADDR_LO_LSHIFT_ADDR)
2238 
2239 #define	OPT_DRAMADDR(high, low) \
2240 	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
2241 
2242 /*
2243  * Bit masks defining what's in Opteron DRAM Address Map base register
2244  */
2245 #define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
2246 #define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
2247 #define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */
2248 
2249 /*
2250  * Bit masks defining what's in Opteron DRAM Address Map limit register
2251  */
2252 #define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7	/* destination node */
2253 #define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700	/* interleave select */
2254 
2255 
2256 /*
2257  * Opteron Node ID register in PCI configuration space contains
2258  * number of nodes in system, etc. for Opteron K8. The following
2259  * constants and macros define its contents, structure, and access.
2260  */
2261 
2262 /*
2263  * Bit masks defining what's in Opteron Node ID register
2264  */
2265 #define	OPT_NODE_MASK_ID	0x7	/* node ID */
2266 #define	OPT_NODE_MASK_CNT	0x70	/* node count */
2267 #define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
2268 #define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
2269 #define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */
2270 
2271 /*
2272  * How many bits in Opteron Node ID register to shift right to get actual value
2273  */
2274 #define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */
2275 
2276 /*
2277  * Macros to get values from Opteron Node ID register
2278  */
2279 #define	OPT_NODE_CNT(reg) \
2280 	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)
2281 
2282 /*
2283  * Macro to set up the PCI Extended Configuration Space (ECS) address to give
2284  * to "in/out" instructions
2285  *
2286  * NOTE: This should only be used in lgrp_plat_init() before MMIO is set up,
2287  *	 since any other use should just do MMIO to access PCI ECS.
2288  *	 On Greyhound, the extended CF8 space access bit must be enabled in
2289  *	 the Northbridge Configuration Register before PCI ECS can be accessed
2290  *	 with "in/out" instructions, and that bit must be restored once the
2291  *	 PCI ECS accesses are done.
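 *
 *	 The macro below packs the address as: PCI_CONE (config enable) plus
 *	 bus in bits 23-16, device in bits 15-11, function in bits 10-8, the
 *	 low byte of the register offset (dword-aligned) in bits 7-2, and the
 *	 upper four bits of the register offset in bits 27-24.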
2292 */ 2293 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 2294 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 2295 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 2296 ((((reg) >> 8) & 0xf) << 24)) 2297 2298 /* 2299 * PCI configuration space registers accessed by specifying 2300 * a bus, device, function, and offset. The following constants 2301 * define the values needed to access Opteron K8 configuration 2302 * info to determine its node topology 2303 */ 2304 2305 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 2306 2307 /* 2308 * Opteron PCI configuration space register function values 2309 */ 2310 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 2311 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 2312 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 2313 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 2314 2315 /* 2316 * PCI Configuration Space register offsets 2317 */ 2318 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 2319 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 2320 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 2321 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 2322 2323 /* 2324 * Opteron PCI Configuration Space device IDs for nodes 2325 */ 2326 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 2327 2328 2329 /* 2330 * Opteron DRAM address map gives base and limit for physical memory in a node 2331 */ 2332 typedef struct opt_dram_addr_map { 2333 uint32_t base_hi; 2334 uint32_t base_lo; 2335 uint32_t limit_hi; 2336 uint32_t limit_lo; 2337 } opt_dram_addr_map_t; 2338 2339 2340 /* 2341 * Supported AMD processor families 2342 */ 2343 #define AMD_FAMILY_HAMMER 15 2344 #define AMD_FAMILY_GREYHOUND 16 2345 2346 /* 2347 * Whether to have is_opteron() return 1 even when processor isn't supported 2348 */ 2349 uint_t is_opteron_override = 0; 2350 2351 /* 2352 * AMD processor family for current CPU 2353 */ 2354 uint_t opt_family = 0; 2355 2356 2357 /* 2358 * Determine whether we're running on a supported AMD Opteron since reading 2359 * node count and DRAM address map registers may have different format or 2360 * may not be supported across processor families 2361 */ 2362 static int 2363 is_opteron(void) 2364 { 2365 2366 if (x86_vendor != X86_VENDOR_AMD) 2367 return (0); 2368 2369 opt_family = cpuid_getfamily(CPU); 2370 if (opt_family == AMD_FAMILY_HAMMER || 2371 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 2372 return (1); 2373 else 2374 return (0); 2375 } 2376 2377 2378 /* 2379 * Determine NUMA configuration for Opteron from registers that live in PCI 2380 * configuration space 2381 */ 2382 static void 2383 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 2384 node_phys_addr_map_t *node_memory) 2385 { 2386 uint_t bus; 2387 uint_t dev; 2388 struct opt_dram_addr_map dram_map[MAX_NODES]; 2389 uint_t node; 2390 uint_t node_info[MAX_NODES]; 2391 uint_t off_hi; 2392 uint_t off_lo; 2393 uint64_t nb_cfg_reg; 2394 2395 /* 2396 * Read configuration registers from PCI configuration space to 2397 * determine node information, which memory is in each node, etc. 
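	 * For example (register values here are illustrative only), a Node ID
	 * register value of 0x30 decodes via OPT_NODE_CNT() to 3, which the
	 * code below turns into a 4 node system, and a DRAM limit pair of
	 * limit_hi == 0 and limit_lo == 0x00FF0000 decodes via OPT_DRAMADDR()
	 * to 0xFF000000, i.e. an end address of 0xFFFFFFFF once
	 * OPT_DRAMADDR_LO_MASK_OFF is ORed in.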
2398 * 2399 * Write to PCI configuration space address register to specify 2400 * which configuration register to read and read/write PCI 2401 * configuration space data register to get/set contents 2402 */ 2403 bus = OPT_PCS_BUS_CONFIG; 2404 dev = OPT_PCS_DEV_NODE0; 2405 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 2406 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 2407 2408 /* 2409 * Read node ID register for node 0 to get node count 2410 */ 2411 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 2412 OPT_PCS_OFF_NODEID); 2413 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 2414 2415 /* 2416 * If number of nodes is more than maximum supported, then set node 2417 * count to 1 and treat system as UMA instead of NUMA. 2418 */ 2419 if (*node_cnt > MAX_NODES) { 2420 *node_cnt = 1; 2421 return; 2422 } 2423 2424 /* 2425 * For Greyhound, PCI Extended Configuration Space must be enabled to 2426 * read high DRAM address map base and limit registers 2427 */ 2428 if (opt_family == AMD_FAMILY_GREYHOUND) { 2429 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 2430 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2431 wrmsr(MSR_AMD_NB_CFG, 2432 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 2433 } 2434 2435 for (node = 0; node < *node_cnt; node++) { 2436 uint32_t base_hi; 2437 uint32_t base_lo; 2438 uint32_t limit_hi; 2439 uint32_t limit_lo; 2440 2441 /* 2442 * Read node ID register (except for node 0 which we just read) 2443 */ 2444 if (node > 0) { 2445 node_info[node] = pci_getl_func(bus, dev, 2446 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 2447 } 2448 2449 /* 2450 * Read DRAM base and limit registers which specify 2451 * physical memory range of each node 2452 */ 2453 if (opt_family != AMD_FAMILY_GREYHOUND) 2454 base_hi = 0; 2455 else { 2456 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2457 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2458 base_hi = dram_map[node].base_hi = 2459 inl(PCI_CONFDATA); 2460 } 2461 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 2462 OPT_PCS_FUNC_ADDRMAP, off_lo); 2463 2464 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 2465 mem_intrlv) 2466 *mem_intrlv = *mem_intrlv + 1; 2467 2468 off_hi += 4; /* high limit register offset */ 2469 if (opt_family != AMD_FAMILY_GREYHOUND) 2470 limit_hi = 0; 2471 else { 2472 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2473 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2474 limit_hi = dram_map[node].limit_hi = 2475 inl(PCI_CONFDATA); 2476 } 2477 2478 off_lo += 4; /* low limit register offset */ 2479 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 2480 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 2481 2482 /* 2483 * Increment device number to next node and register offsets 2484 * for DRAM base register of next node 2485 */ 2486 off_hi += 4; 2487 off_lo += 4; 2488 dev++; 2489 2490 /* 2491 * Both read and write enable bits must be enabled in DRAM 2492 * address map base register for physical memory to exist in 2493 * node 2494 */ 2495 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 2496 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 2497 /* 2498 * Mark node memory as non-existent and set start and 2499 * end addresses to be same in node_memory[] 2500 */ 2501 node_memory[node].exists = 0; 2502 node_memory[node].start = node_memory[node].end = 2503 (pfn_t)-1; 2504 continue; 2505 } 2506 2507 /* 2508 * Mark node memory as existing and remember physical address 2509 * range of each node for use later 2510 */ 2511 node_memory[node].exists = 1; 2512 2513 node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 2514 2515 node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 2516 
    OPT_DRAMADDR_LO_MASK_OFF);
2517 	}
2518 
2519 	/*
2520 	 * Restore PCI Extended Configuration Space enable bit
2521 	 */
2522 	if (opt_family == AMD_FAMILY_GREYHOUND) {
2523 		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
2524 			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
2525 	}
2526 }
2527 
2528 
2529 /*
2530  * Return average time to read the vendor ID register on the Northbridge of
2531  * the specified destination node, measured over N reads from the current CPU
2532  */
2533 static hrtime_t
2534 opt_probe_vendor(int dest_node, int nreads)
2535 {
2536 	int		cnt;
2537 	uint_t		dev;
2538 	/* LINTED: set but not used in function */
2539 	volatile uint_t	dev_vendor;
2540 	hrtime_t	elapsed;
2541 	hrtime_t	end;
2542 	int		ipl;
2543 	hrtime_t	start;
2544 
2545 	dev = OPT_PCS_DEV_NODE0 + dest_node;
2546 	kpreempt_disable();
2547 	ipl = spl8();
2548 	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
2549 	    OPT_PCS_OFF_VENDOR));
2550 	start = gethrtime();
2551 	for (cnt = 0; cnt < nreads; cnt++)
2552 		dev_vendor = inl(PCI_CONFDATA);
2553 	end = gethrtime();
2554 	elapsed = (end - start) / nreads;
2555 	splx(ipl);
2556 	kpreempt_enable();
2557 	return (elapsed);
2558 }
2559 
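
/*
 * Illustrative sketch (not an additional code path in this file) of how the
 * vendor ID probe above can be used to estimate the latency from the current
 * CPU to a given node, using the probe read count referenced by the probing
 * code earlier in this file:
 *
 *	hrtime_t lat = opt_probe_vendor(2, lgrp_plat_probe_nreads);
 *
 * The value returned is the average time for a single read of that node's
 * Northbridge vendor ID register.
 */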