1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 #pragma ident "%Z%%M% %I% %E% SMI" 28 29 30 /* 31 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS 32 * ================================================================ 33 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access 34 * (NUMA). A NUMA machine consists of one or more "nodes" that each consist of 35 * one or more CPUs and some local memory. The CPUs in each node can access 36 * the memory in the other nodes but at a higher latency than accessing their 37 * local memory. Typically, a system with only one node has Uniform Memory 38 * Access (UMA), but it may be possible to have a one node system that has 39 * some global memory outside of the node which is higher latency. 40 * 41 * Module Description 42 * ------------------ 43 * This module provides a platform interface for determining which CPUs and 44 * which memory (and how much) are in a NUMA node and how far each node is from 45 * each other. The interface is used by the Virtual Memory (VM) system and the 46 * common lgroup framework. The VM system uses the plat_*() routines to fill 47 * in its memory node (memnode) array with the physical address range spanned 48 * by each NUMA node to know which memory belongs to which node, so it can 49 * build and manage a physical page free list for each NUMA node and allocate 50 * local memory from each node as needed. The common lgroup framework uses the 51 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong 52 * to each node (leaf lgroup) and how far each node is from each other, so it 53 * can build the latency (lgroup) topology for the machine in order to optimize 54 * for locality. Also, an lgroup platform handle instead of lgroups are used 55 * in the interface with this module, so this module shouldn't need to know 56 * anything about lgroups. Instead, it just needs to know which CPUs, memory, 57 * etc. are in each NUMA node, how far each node is from each other, and to use 58 * a unique lgroup platform handle to refer to each node through the interface. 59 * 60 * Determining NUMA Configuration 61 * ------------------------------ 62 * By default, this module will try to determine the NUMA configuration of the 63 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System 64 * Locality Information Table (SLIT). The SRAT contains info to tell which 65 * CPUs and memory are local to a given proximity domain (NUMA node). The SLIT 66 * is a matrix that gives the distance between each system locality (which is 67 * a NUMA node and should correspond to proximity domains in the SRAT). For 68 * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer 69 * specification. 70 * 71 * If the SRAT doesn't exist on a system with AMD Opteron processors, we 72 * examine registers in PCI configuration space to determine how many nodes are 73 * in the system and which CPUs and memory are in each node. 74 * do while booting the kernel. 75 * 76 * NOTE: Using these PCI configuration space registers to determine this 77 * locality info is not guaranteed to work or be compatible across all 78 * Opteron processor families. 79 * 80 * If the SLIT does not exist or look right, the kernel will probe to determine 81 * the distance between nodes as long as the NUMA CPU and memory configuration 82 * has been determined (see lgrp_plat_probe() for details). 83 * 84 * Data Structures 85 * --------------- 86 * The main data structures used by this code are the following: 87 * 88 * - lgrp_plat_cpu_node[] CPU to node ID mapping table indexed by 89 * CPU ID (only used for SRAT) 90 * 91 * - lgrp_plat_lat_stats.latencies[][] Table of latencies between same and 92 * different nodes indexed by node ID 93 * 94 * - lgrp_plat_node_cnt Number of NUMA nodes in system 95 * 96 * - lgrp_plat_node_domain[] Node ID to proximity domain ID mapping 97 * table indexed by node ID (only used 98 * for SRAT) 99 * 100 * - lgrp_plat_node_memory[] Table with physical address range for 101 * each node indexed by node ID 102 * 103 * The code is implemented to make the following always be true: 104 * 105 * lgroup platform handle == node ID == memnode ID 106 * 107 * Moreover, it allows for the proximity domain ID to be equal to all of the 108 * above as long as the proximity domains IDs are numbered from 0 to <number of 109 * nodes - 1>. This is done by hashing each proximity domain ID into the range 110 * from 0 to <number of nodes - 1>. Then proximity ID N will hash into node ID 111 * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N] 112 * and be assigned node ID N. If the proximity domain IDs aren't numbered 113 * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into 114 * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs 115 * to node IDs. However, the proximity domain IDs may not map to the 116 * equivalent node ID since we want to keep the node IDs numbered from 0 to 117 * <number of nodes - 1> to minimize cost of searching and potentially space. 118 */ 119 120 121 #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ 122 #include <sys/bootconf.h> 123 #include <sys/cmn_err.h> 124 #include <sys/controlregs.h> 125 #include <sys/cpupart.h> 126 #include <sys/cpuvar.h> 127 #include <sys/lgrp.h> 128 #include <sys/machsystm.h> 129 #include <sys/memlist.h> 130 #include <sys/memnode.h> 131 #include <sys/mman.h> 132 #include <sys/pci_cfgspace.h> 133 #include <sys/pci_impl.h> 134 #include <sys/param.h> 135 #include <sys/pghw.h> 136 #include <sys/promif.h> /* for prom_printf() */ 137 #include <sys/sysmacros.h> 138 #include <sys/systm.h> 139 #include <sys/thread.h> 140 #include <sys/types.h> 141 #include <sys/var.h> 142 #include <sys/x86_archext.h> /* for x86_feature and X86_AMD */ 143 #include <vm/hat_i86.h> 144 #include <vm/seg_kmem.h> 145 #include <vm/vm_dep.h> 146 147 #include "acpi_fw.h" /* for SRAT and SLIT */ 148 149 150 #define MAX_NODES 8 151 #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1) 152 153 /* 154 * Constants for configuring probing 155 */ 156 #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */ 157 #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */ 158 #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */ 159 160 /* 161 * Flags for probing 162 */ 163 #define LGRP_PLAT_PROBE_ENABLE 0x1 /* enable probing */ 164 #define LGRP_PLAT_PROBE_PGCPY 0x2 /* probe using page copy */ 165 #define LGRP_PLAT_PROBE_VENDOR 0x4 /* probe vendor ID register */ 166 167 /* 168 * Hash proximity domain ID into node to domain mapping table using to minimize 169 * span of entries used 170 */ 171 #define NODE_DOMAIN_HASH(domain, node_cnt) ((domain) % node_cnt) 172 173 174 /* 175 * CPU to node ID mapping structure (only used with SRAT) 176 */ 177 typedef struct cpu_node_map { 178 int exists; 179 uint_t node; 180 uint32_t apicid; 181 uint32_t prox_domain; 182 } cpu_node_map_t; 183 184 /* 185 * Latency statistics 186 */ 187 typedef struct lgrp_plat_latency_stats { 188 hrtime_t latencies[MAX_NODES][MAX_NODES]; 189 hrtime_t latency_max; 190 hrtime_t latency_min; 191 } lgrp_plat_latency_stats_t; 192 193 /* 194 * Memory configuration for probing 195 */ 196 typedef struct lgrp_plat_probe_mem_config { 197 size_t probe_memsize; /* how much memory to probe per node */ 198 caddr_t probe_va[MAX_NODES]; /* where memory mapped for probing */ 199 pfn_t probe_pfn[MAX_NODES]; /* physical pages to map for probing */ 200 } lgrp_plat_probe_mem_config_t; 201 202 /* 203 * Statistics kept for probing 204 */ 205 typedef struct lgrp_plat_probe_stats { 206 hrtime_t flush_cost; 207 hrtime_t probe_cost; 208 hrtime_t probe_cost_total; 209 hrtime_t probe_error_code; 210 hrtime_t probe_errors[MAX_NODES][MAX_NODES]; 211 int probe_suspect[MAX_NODES][MAX_NODES]; 212 hrtime_t probe_max[MAX_NODES][MAX_NODES]; 213 hrtime_t probe_min[MAX_NODES][MAX_NODES]; 214 } lgrp_plat_probe_stats_t; 215 216 /* 217 * Node to proximity domain ID mapping structure (only used with SRAT) 218 */ 219 typedef struct node_domain_map { 220 int exists; 221 uint32_t prox_domain; 222 } node_domain_map_t; 223 224 /* 225 * Node ID and starting and ending page for physical memory in node 226 */ 227 typedef struct node_phys_addr_map { 228 pfn_t start; 229 pfn_t end; 230 int exists; 231 uint32_t prox_domain; 232 } node_phys_addr_map_t; 233 234 /* 235 * Number of CPUs for which we got APIC IDs 236 */ 237 static int lgrp_plat_apic_ncpus = 0; 238 239 /* 240 * CPU to node ID mapping table (only used for SRAT) 241 */ 242 static cpu_node_map_t lgrp_plat_cpu_node[NCPU]; 243 244 /* 245 * Latency statistics 246 */ 247 lgrp_plat_latency_stats_t lgrp_plat_lat_stats; 248 249 /* 250 * Whether memory is interleaved across nodes causing MPO to be disabled 251 */ 252 static int lgrp_plat_mem_intrlv = 0; 253 254 /* 255 * Node ID to proximity domain ID mapping table (only used for SRAT) 256 */ 257 static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; 258 259 /* 260 * Physical address range for memory in each node 261 */ 262 static node_phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 263 264 /* 265 * Statistics gotten from probing 266 */ 267 static lgrp_plat_probe_stats_t lgrp_plat_probe_stats; 268 269 /* 270 * Memory configuration for probing 271 */ 272 static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config; 273 274 /* 275 * Error code from processing ACPI SRAT 276 */ 277 static int lgrp_plat_srat_error = 0; 278 279 /* 280 * Error code from processing ACPI SLIT 281 */ 282 static int lgrp_plat_slit_error = 0; 283 284 /* 285 * Allocate lgroup array statically 286 */ 287 static lgrp_t lgrp_space[NLGRP]; 288 static int nlgrps_alloc; 289 290 291 /* 292 * Number of nodes in system 293 */ 294 uint_t lgrp_plat_node_cnt = 1; 295 296 /* 297 * Configuration Parameters for Probing 298 * - lgrp_plat_probe_flags Flags to specify enabling probing, probe 299 * operation, etc. 300 * - lgrp_plat_probe_nrounds How many rounds of probing to do 301 * - lgrp_plat_probe_nsamples Number of samples to take when probing each 302 * node 303 * - lgrp_plat_probe_nreads Number of times to read vendor ID from 304 * Northbridge for each probe 305 */ 306 uint_t lgrp_plat_probe_flags = 0; 307 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 308 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 309 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 310 311 /* 312 * Enable use of ACPI System Resource Affinity Table (SRAT) and System 313 * Locality Information Table (SLIT) 314 */ 315 int lgrp_plat_srat_enable = 1; 316 int lgrp_plat_slit_enable = 1; 317 318 /* 319 * Static array to hold lgroup statistics 320 */ 321 struct lgrp_stats lgrp_stats[NLGRP]; 322 323 324 /* 325 * Forward declarations of platform interface routines 326 */ 327 void plat_build_mem_nodes(struct memlist *list); 328 329 int plat_lgrphand_to_mem_node(lgrp_handle_t hand); 330 331 lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); 332 333 int plat_mnode_xcheck(pfn_t pfncnt); 334 335 int plat_pfn_to_mem_node(pfn_t pfn); 336 337 /* 338 * Forward declarations of lgroup platform interface routines 339 */ 340 lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); 341 342 void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg); 343 344 lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); 345 346 void lgrp_plat_init(void); 347 348 int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); 349 350 void lgrp_plat_main_init(void); 351 352 int lgrp_plat_max_lgrps(void); 353 354 pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, 355 lgrp_mem_query_t query); 356 357 lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn); 358 359 void lgrp_plat_probe(void); 360 361 lgrp_handle_t lgrp_plat_root_hand(void); 362 363 364 /* 365 * Forward declarations of local routines 366 */ 367 static int is_opteron(void); 368 369 static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, 370 int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, 371 uint32_t domain); 372 373 static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node); 374 375 static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, 376 int node_cnt, uint32_t domain); 377 378 static void lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 379 lgrp_plat_latency_stats_t *lat_stats, 380 lgrp_plat_probe_stats_t *probe_stats); 381 382 static int lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 383 lgrp_plat_latency_stats_t *lat_stats); 384 385 static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); 386 387 static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, 388 int node_cnt, uint32_t domain); 389 390 static int lgrp_plat_node_memory_update(node_domain_map_t *node_domain, 391 int node_cnt, node_phys_addr_map_t *node_memory, uint64_t start, 392 uint64_t end, uint32_t domain); 393 394 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 395 lgrp_plat_probe_mem_config_t *probe_mem_config, 396 lgrp_plat_latency_stats_t *lat_stats, 397 lgrp_plat_probe_stats_t *probe_stats); 398 399 static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node); 400 401 static int lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 402 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats); 403 404 static int lgrp_plat_process_srat(struct srat *tp, 405 node_domain_map_t *node_domain, cpu_node_map_t *cpu_node, int cpu_count, 406 node_phys_addr_map_t *node_memory); 407 408 static int lgrp_plat_srat_domains(struct srat *tp); 409 410 static void lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 411 lgrp_plat_latency_stats_t *lat_stats); 412 413 static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 414 node_phys_addr_map_t *node_memory); 415 416 static hrtime_t opt_probe_vendor(int dest_node, int nreads); 417 418 419 /* 420 * PLATFORM INTERFACE ROUTINES 421 */ 422 423 /* 424 * Configure memory nodes for machines with more than one node (ie NUMA) 425 */ 426 void 427 plat_build_mem_nodes(struct memlist *list) 428 { 429 pfn_t cur_start; /* start addr of subrange */ 430 pfn_t cur_end; /* end addr of subrange */ 431 pfn_t start; /* start addr of whole range */ 432 pfn_t end; /* end addr of whole range */ 433 434 /* 435 * Boot install lists are arranged <addr, len>, ... 436 */ 437 while (list) { 438 int node; 439 440 start = list->address >> PAGESHIFT; 441 end = (list->address + list->size - 1) >> PAGESHIFT; 442 443 if (start > physmax) { 444 list = list->next; 445 continue; 446 } 447 if (end > physmax) 448 end = physmax; 449 450 /* 451 * When there is only one memnode, just add memory to memnode 452 */ 453 if (max_mem_nodes == 1) { 454 mem_node_add_slice(start, end); 455 list = list->next; 456 continue; 457 } 458 459 /* 460 * mem_node_add_slice() expects to get a memory range that 461 * is within one memnode, so need to split any memory range 462 * that spans multiple memnodes into subranges that are each 463 * contained within one memnode when feeding them to 464 * mem_node_add_slice() 465 */ 466 cur_start = start; 467 do { 468 node = plat_pfn_to_mem_node(cur_start); 469 470 /* 471 * Panic if DRAM address map registers or SRAT say 472 * memory in node doesn't exist or address from 473 * boot installed memory list entry isn't in this node. 474 * This shouldn't happen and rest of code can't deal 475 * with this if it does. 476 */ 477 if (node < 0 || node >= lgrp_plat_node_cnt || 478 !lgrp_plat_node_memory[node].exists || 479 cur_start < lgrp_plat_node_memory[node].start || 480 cur_start > lgrp_plat_node_memory[node].end) { 481 cmn_err(CE_PANIC, "Don't know which memnode " 482 "to add installed memory address 0x%lx\n", 483 cur_start); 484 } 485 486 /* 487 * End of current subrange should not span memnodes 488 */ 489 cur_end = end; 490 if (lgrp_plat_node_memory[node].exists && 491 cur_end > lgrp_plat_node_memory[node].end) 492 cur_end = lgrp_plat_node_memory[node].end; 493 494 mem_node_add_slice(cur_start, cur_end); 495 496 /* 497 * Next subrange starts after end of current one 498 */ 499 cur_start = cur_end + 1; 500 } while (cur_end < end); 501 502 list = list->next; 503 } 504 mem_node_physalign = 0; 505 mem_node_pfn_shift = 0; 506 } 507 508 509 int 510 plat_lgrphand_to_mem_node(lgrp_handle_t hand) 511 { 512 if (max_mem_nodes == 1) 513 return (0); 514 515 return ((int)hand); 516 } 517 518 519 /* 520 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 521 * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if 522 * a crossing is found and returns 0 otherwise. 523 */ 524 int 525 plat_mnode_xcheck(pfn_t pfncnt) 526 { 527 int node, prevnode = -1, basenode; 528 pfn_t ea, sa; 529 530 for (node = 0; node < lgrp_plat_node_cnt; node++) { 531 532 if (lgrp_plat_node_memory[node].exists == 0) 533 continue; 534 535 if (prevnode == -1) { 536 prevnode = node; 537 basenode = node; 538 continue; 539 } 540 541 /* assume x86 node pfn ranges are in increasing order */ 542 ASSERT(lgrp_plat_node_memory[node].start > 543 lgrp_plat_node_memory[prevnode].end); 544 545 /* 546 * continue if the starting address of node is not contiguous 547 * with the previous node. 548 */ 549 550 if (lgrp_plat_node_memory[node].start != 551 (lgrp_plat_node_memory[prevnode].end + 1)) { 552 basenode = node; 553 prevnode = node; 554 continue; 555 } 556 557 /* check if the starting address of node is pfncnt aligned */ 558 if ((lgrp_plat_node_memory[node].start & (pfncnt - 1)) != 0) { 559 560 /* 561 * at this point, node starts at an unaligned boundary 562 * and is contiguous with the previous node(s) to 563 * basenode. Check if there is an aligned contiguous 564 * range of length pfncnt that crosses this boundary. 565 */ 566 567 sa = P2ALIGN(lgrp_plat_node_memory[prevnode].end, 568 pfncnt); 569 ea = P2ROUNDUP((lgrp_plat_node_memory[node].start), 570 pfncnt); 571 572 ASSERT((ea - sa) == pfncnt); 573 if (sa >= lgrp_plat_node_memory[basenode].start && 574 ea <= (lgrp_plat_node_memory[node].end + 1)) 575 return (1); 576 } 577 prevnode = node; 578 } 579 return (0); 580 } 581 582 583 lgrp_handle_t 584 plat_mem_node_to_lgrphand(int mnode) 585 { 586 if (max_mem_nodes == 1) 587 return (LGRP_DEFAULT_HANDLE); 588 589 return ((lgrp_handle_t)mnode); 590 } 591 592 593 int 594 plat_pfn_to_mem_node(pfn_t pfn) 595 { 596 int node; 597 598 if (max_mem_nodes == 1) 599 return (0); 600 601 for (node = 0; node < lgrp_plat_node_cnt; node++) { 602 /* 603 * Skip nodes with no memory 604 */ 605 if (!lgrp_plat_node_memory[node].exists) 606 continue; 607 608 if (pfn >= lgrp_plat_node_memory[node].start && 609 pfn <= lgrp_plat_node_memory[node].end) 610 return (node); 611 } 612 613 /* 614 * Didn't find memnode where this PFN lives which should never happen 615 */ 616 ASSERT(node < lgrp_plat_node_cnt); 617 return (-1); 618 } 619 620 621 /* 622 * LGROUP PLATFORM INTERFACE ROUTINES 623 */ 624 625 /* 626 * Allocate additional space for an lgroup. 627 */ 628 /* ARGSUSED */ 629 lgrp_t * 630 lgrp_plat_alloc(lgrp_id_t lgrpid) 631 { 632 lgrp_t *lgrp; 633 634 lgrp = &lgrp_space[nlgrps_alloc++]; 635 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 636 return (NULL); 637 return (lgrp); 638 } 639 640 641 /* 642 * Platform handling for (re)configuration changes 643 */ 644 /* ARGSUSED */ 645 void 646 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 647 { 648 } 649 650 651 /* 652 * Return the platform handle for the lgroup containing the given CPU 653 */ 654 /* ARGSUSED */ 655 lgrp_handle_t 656 lgrp_plat_cpu_to_hand(processorid_t id) 657 { 658 lgrp_handle_t hand; 659 660 if (lgrp_plat_node_cnt == 1) 661 return (LGRP_DEFAULT_HANDLE); 662 663 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 664 lgrp_plat_cpu_node); 665 666 ASSERT(hand != (lgrp_handle_t)-1); 667 if (hand == (lgrp_handle_t)-1) 668 return (LGRP_NULL_HANDLE); 669 670 return (hand); 671 } 672 673 674 /* 675 * Platform-specific initialization of lgroups 676 */ 677 void 678 lgrp_plat_init(void) 679 { 680 #if defined(__xpv) 681 /* 682 * XXPV For now, the hypervisor treats all memory equally. 683 */ 684 lgrp_plat_node_cnt = max_mem_nodes = 1; 685 #else /* __xpv */ 686 uint_t probe_op; 687 688 /* 689 * Initialize as a UMA machine 690 */ 691 if (lgrp_topo_ht_limit() == 1) { 692 lgrp_plat_node_cnt = max_mem_nodes = 1; 693 return; 694 } 695 696 /* 697 * Read boot property with CPU to APIC ID mapping table/array and fill 698 * in CPU to node ID mapping table with APIC ID for each CPU 699 */ 700 lgrp_plat_apic_ncpus = 701 lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); 702 703 /* 704 * Determine which CPUs and memory are local to each other and number 705 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) 706 */ 707 if (lgrp_plat_apic_ncpus > 0) { 708 int retval; 709 710 retval = lgrp_plat_process_srat(srat_ptr, 711 lgrp_plat_node_domain, lgrp_plat_cpu_node, 712 lgrp_plat_apic_ncpus, lgrp_plat_node_memory); 713 if (retval <= 0) { 714 lgrp_plat_srat_error = retval; 715 lgrp_plat_node_cnt = 1; 716 } else { 717 lgrp_plat_srat_error = 0; 718 lgrp_plat_node_cnt = retval; 719 } 720 } 721 722 /* 723 * Try to use PCI config space registers on Opteron if there's an error 724 * processing CPU to APIC ID mapping or SRAT 725 */ 726 if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && 727 is_opteron()) 728 opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, 729 lgrp_plat_node_memory); 730 731 /* 732 * Don't bother to setup system for multiple lgroups and only use one 733 * memory node when memory is interleaved between any nodes or there is 734 * only one NUMA node 735 * 736 * NOTE: May need to change this for Dynamic Reconfiguration (DR) 737 * when and if it happens for x86/x64 738 */ 739 if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { 740 lgrp_plat_node_cnt = max_mem_nodes = 1; 741 (void) lgrp_topo_ht_limit_set(1); 742 return; 743 } 744 745 /* 746 * Leaf lgroups on x86/x64 architectures contain one physical 747 * processor chip. Tune lgrp_expand_proc_thresh and 748 * lgrp_expand_proc_diff so that lgrp_choose() will spread 749 * things out aggressively. 750 */ 751 lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 752 lgrp_expand_proc_diff = 0; 753 754 /* 755 * There should be one memnode (physical page free list(s)) for 756 * each node 757 */ 758 max_mem_nodes = lgrp_plat_node_cnt; 759 760 /* 761 * Initialize min and max latency before reading SLIT or probing 762 */ 763 lgrp_plat_lat_stats.latency_min = -1; 764 lgrp_plat_lat_stats.latency_max = 0; 765 766 /* 767 * Determine how far each NUMA node is from each other by 768 * reading ACPI System Locality Information Table (SLIT) if it 769 * exists 770 */ 771 lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, 772 lgrp_plat_node_cnt, lgrp_plat_node_memory, 773 &lgrp_plat_lat_stats); 774 if (lgrp_plat_slit_error == 0) 775 return; 776 777 /* 778 * Probe to determine latency between NUMA nodes when SLIT 779 * doesn't exist or make sense 780 */ 781 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; 782 783 /* 784 * Specify whether to probe using vendor ID register or page copy 785 * if hasn't been specified already or is overspecified 786 */ 787 probe_op = lgrp_plat_probe_flags & 788 (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 789 790 if (probe_op == 0 || 791 probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { 792 lgrp_plat_probe_flags &= 793 ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 794 if (is_opteron()) 795 lgrp_plat_probe_flags |= 796 LGRP_PLAT_PROBE_VENDOR; 797 else 798 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; 799 } 800 801 /* 802 * Probing errors can mess up the lgroup topology and 803 * force us fall back to a 2 level lgroup topology. 804 * Here we bound how tall the lgroup topology can grow 805 * in hopes of avoiding any anamolies in probing from 806 * messing up the lgroup topology by limiting the 807 * accuracy of the latency topology. 808 * 809 * Assume that nodes will at least be configured in a 810 * ring, so limit height of lgroup topology to be less 811 * than number of nodes on a system with 4 or more 812 * nodes 813 */ 814 if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 815 lgrp_topo_ht_limit_default()) 816 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 817 #endif /* __xpv */ 818 } 819 820 821 /* 822 * Return latency between "from" and "to" lgroups 823 * 824 * This latency number can only be used for relative comparison 825 * between lgroups on the running system, cannot be used across platforms, 826 * and may not reflect the actual latency. It is platform and implementation 827 * specific, so platform gets to decide its value. It would be nice if the 828 * number was at least proportional to make comparisons more meaningful though. 829 */ 830 /* ARGSUSED */ 831 int 832 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 833 { 834 lgrp_handle_t src, dest; 835 int node; 836 837 if (max_mem_nodes == 1) 838 return (0); 839 840 /* 841 * Return max latency for root lgroup 842 */ 843 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 844 return (lgrp_plat_lat_stats.latency_max); 845 846 src = from; 847 dest = to; 848 849 /* 850 * Return 0 for nodes (lgroup platform handles) out of range 851 */ 852 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 853 return (0); 854 855 /* 856 * Probe from current CPU if its lgroup latencies haven't been set yet 857 * and we are trying to get latency from current CPU to some node 858 */ 859 node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 860 ASSERT(node >= 0 && node < lgrp_plat_node_cnt); 861 if (lgrp_plat_lat_stats.latencies[src][src] == 0 && node == src) 862 lgrp_plat_probe(); 863 864 return (lgrp_plat_lat_stats.latencies[src][dest]); 865 } 866 867 868 /* 869 * Platform-specific initialization 870 */ 871 void 872 lgrp_plat_main_init(void) 873 { 874 int curnode; 875 int ht_limit; 876 int i; 877 878 /* 879 * Print a notice that MPO is disabled when memory is interleaved 880 * across nodes....Would do this when it is discovered, but can't 881 * because it happens way too early during boot.... 882 */ 883 if (lgrp_plat_mem_intrlv) 884 cmn_err(CE_NOTE, 885 "MPO disabled because memory is interleaved\n"); 886 887 /* 888 * Don't bother to do any probing if it is disabled, there is only one 889 * node, or the height of the lgroup topology less than or equal to 2 890 */ 891 ht_limit = lgrp_topo_ht_limit(); 892 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 893 max_mem_nodes == 1 || ht_limit <= 2) { 894 /* 895 * Setup lgroup latencies for 2 level lgroup topology 896 * (ie. local and remote only) if they haven't been set yet 897 */ 898 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 899 lgrp_plat_lat_stats.latency_max == 0) 900 lgrp_plat_2level_setup(lgrp_plat_node_memory, 901 &lgrp_plat_lat_stats); 902 return; 903 } 904 905 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 906 /* 907 * Should have been able to probe from CPU 0 when it was added 908 * to lgroup hierarchy, but may not have been able to then 909 * because it happens so early in boot that gethrtime() hasn't 910 * been initialized. (:-( 911 */ 912 curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 913 ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); 914 if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) 915 lgrp_plat_probe(); 916 917 return; 918 } 919 920 /* 921 * When probing memory, use one page for every sample to determine 922 * lgroup topology and taking multiple samples 923 */ 924 if (lgrp_plat_probe_mem_config.probe_memsize == 0) 925 lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * 926 lgrp_plat_probe_nsamples; 927 928 /* 929 * Map memory in each node needed for probing to determine latency 930 * topology 931 */ 932 for (i = 0; i < lgrp_plat_node_cnt; i++) { 933 int mnode; 934 935 /* 936 * Skip this node and leave its probe page NULL 937 * if it doesn't have any memory 938 */ 939 mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); 940 if (!mem_node_config[mnode].exists) { 941 lgrp_plat_probe_mem_config.probe_va[i] = NULL; 942 continue; 943 } 944 945 /* 946 * Allocate one kernel virtual page 947 */ 948 lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, 949 lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); 950 if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { 951 cmn_err(CE_WARN, 952 "lgrp_plat_main_init: couldn't allocate memory"); 953 return; 954 } 955 956 /* 957 * Get PFN for first page in each node 958 */ 959 lgrp_plat_probe_mem_config.probe_pfn[i] = 960 mem_node_config[mnode].physbase; 961 962 /* 963 * Map virtual page to first page in node 964 */ 965 hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], 966 lgrp_plat_probe_mem_config.probe_memsize, 967 lgrp_plat_probe_mem_config.probe_pfn[i], 968 PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 969 HAT_LOAD_NOCONSIST); 970 } 971 972 /* 973 * Probe from current CPU 974 */ 975 lgrp_plat_probe(); 976 } 977 978 979 /* 980 * Return the maximum number of lgrps supported by the platform. 981 * Before lgrp topology is known it returns an estimate based on the number of 982 * nodes. Once topology is known it returns the actual maximim number of lgrps 983 * created. Since x86/x64 doesn't support Dynamic Reconfiguration (DR) and 984 * dynamic addition of new nodes, this number may not grow during system 985 * lifetime (yet). 986 */ 987 int 988 lgrp_plat_max_lgrps(void) 989 { 990 return (lgrp_topo_initialized ? 991 lgrp_alloc_max + 1 : 992 lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 993 } 994 995 996 /* 997 * Return the number of free pages in an lgroup. 998 * 999 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 1000 * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 1001 * number of allocatable base pagesize pages corresponding to the 1002 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 1003 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 1004 * memory installed, regardless of whether or not it's usable. 1005 */ 1006 pgcnt_t 1007 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 1008 { 1009 int mnode; 1010 pgcnt_t npgs = (pgcnt_t)0; 1011 extern struct memlist *phys_avail; 1012 extern struct memlist *phys_install; 1013 1014 1015 if (plathand == LGRP_DEFAULT_HANDLE) 1016 return (lgrp_plat_mem_size_default(plathand, query)); 1017 1018 if (plathand != LGRP_NULL_HANDLE) { 1019 mnode = plat_lgrphand_to_mem_node(plathand); 1020 if (mnode >= 0 && mem_node_config[mnode].exists) { 1021 switch (query) { 1022 case LGRP_MEM_SIZE_FREE: 1023 npgs = MNODE_PGCNT(mnode); 1024 break; 1025 case LGRP_MEM_SIZE_AVAIL: 1026 npgs = mem_node_memlist_pages(mnode, 1027 phys_avail); 1028 break; 1029 case LGRP_MEM_SIZE_INSTALL: 1030 npgs = mem_node_memlist_pages(mnode, 1031 phys_install); 1032 break; 1033 default: 1034 break; 1035 } 1036 } 1037 } 1038 return (npgs); 1039 } 1040 1041 1042 /* 1043 * Return the platform handle of the lgroup that contains the physical memory 1044 * corresponding to the given page frame number 1045 */ 1046 /* ARGSUSED */ 1047 lgrp_handle_t 1048 lgrp_plat_pfn_to_hand(pfn_t pfn) 1049 { 1050 int mnode; 1051 1052 if (max_mem_nodes == 1) 1053 return (LGRP_DEFAULT_HANDLE); 1054 1055 if (pfn > physmax) 1056 return (LGRP_NULL_HANDLE); 1057 1058 mnode = plat_pfn_to_mem_node(pfn); 1059 if (mnode < 0) 1060 return (LGRP_NULL_HANDLE); 1061 1062 return (MEM_NODE_2_LGRPHAND(mnode)); 1063 } 1064 1065 1066 /* 1067 * Probe memory in each node from current CPU to determine latency topology 1068 * 1069 * The probing code will probe the vendor ID register on the Northbridge of 1070 * Opteron processors and probe memory for other processors by default. 1071 * 1072 * Since probing is inherently error prone, the code takes laps across all the 1073 * nodes probing from each node to each of the other nodes some number of 1074 * times. Furthermore, each node is probed some number of times before moving 1075 * onto the next one during each lap. The minimum latency gotten between nodes 1076 * is kept as the latency between the nodes. 1077 * 1078 * After all that, the probe times are adjusted by normalizing values that are 1079 * close to each other and local latencies are made the same. Lastly, the 1080 * latencies are verified to make sure that certain conditions are met (eg. 1081 * local < remote, latency(a, b) == latency(b, a), etc.). 1082 * 1083 * If any of the conditions aren't met, the code will export a NUMA 1084 * configuration with the local CPUs and memory given by the SRAT or PCI config 1085 * space registers and one remote memory latency since it can't tell exactly 1086 * how far each node is from each other. 1087 */ 1088 void 1089 lgrp_plat_probe(void) 1090 { 1091 int from; 1092 int i; 1093 lgrp_plat_latency_stats_t *lat_stats; 1094 hrtime_t probe_time; 1095 int to; 1096 1097 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 1098 max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 1099 return; 1100 1101 /* 1102 * Determine ID of node containing current CPU 1103 */ 1104 from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node); 1105 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1106 if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 1107 ASSERT(lgrp_plat_node_domain[from].exists); 1108 1109 /* 1110 * Don't need to probe if got times already 1111 */ 1112 lat_stats = &lgrp_plat_lat_stats; 1113 if (lat_stats->latencies[from][from] != 0) 1114 return; 1115 1116 /* 1117 * Read vendor ID in Northbridge or read and write page(s) 1118 * in each node from current CPU and remember how long it takes, 1119 * so we can build latency topology of machine later. 1120 * This should approximate the memory latency between each node. 1121 */ 1122 for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 1123 for (to = 0; to < lgrp_plat_node_cnt; to++) { 1124 /* 1125 * Get probe time and bail out if can't get it yet 1126 */ 1127 probe_time = lgrp_plat_probe_time(to, 1128 lgrp_plat_cpu_node, &lgrp_plat_probe_mem_config, 1129 &lgrp_plat_lat_stats, &lgrp_plat_probe_stats); 1130 if (probe_time == 0) 1131 return; 1132 1133 /* 1134 * Keep lowest probe time as latency between nodes 1135 */ 1136 if (lat_stats->latencies[from][to] == 0 || 1137 probe_time < lat_stats->latencies[from][to]) 1138 lat_stats->latencies[from][to] = probe_time; 1139 1140 /* 1141 * Update overall minimum and maximum probe times 1142 * across all nodes 1143 */ 1144 if (probe_time < lat_stats->latency_min || 1145 lat_stats->latency_min == -1) 1146 lat_stats->latency_min = probe_time; 1147 if (probe_time > lat_stats->latency_max) 1148 lat_stats->latency_max = probe_time; 1149 } 1150 } 1151 1152 /* 1153 * - Fix up latencies such that local latencies are same, 1154 * latency(i, j) == latency(j, i), etc. (if possible) 1155 * 1156 * - Verify that latencies look ok 1157 * 1158 * - Fallback to just optimizing for local and remote if 1159 * latencies didn't look right 1160 */ 1161 lgrp_plat_latency_adjust(lgrp_plat_node_memory, &lgrp_plat_lat_stats, 1162 &lgrp_plat_probe_stats); 1163 lgrp_plat_probe_stats.probe_error_code = 1164 lgrp_plat_latency_verify(lgrp_plat_node_memory, 1165 &lgrp_plat_lat_stats); 1166 if (lgrp_plat_probe_stats.probe_error_code) 1167 lgrp_plat_2level_setup(lgrp_plat_node_memory, 1168 &lgrp_plat_lat_stats); 1169 } 1170 1171 1172 /* 1173 * Return platform handle for root lgroup 1174 */ 1175 lgrp_handle_t 1176 lgrp_plat_root_hand(void) 1177 { 1178 return (LGRP_DEFAULT_HANDLE); 1179 } 1180 1181 1182 /* 1183 * INTERNAL ROUTINES 1184 */ 1185 1186 1187 /* 1188 * Update CPU to node mapping for given CPU and proximity domain (and returns 1189 * negative numbers for errors and positive ones for success) 1190 */ 1191 static int 1192 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt, 1193 cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain) 1194 { 1195 uint_t i; 1196 int node; 1197 1198 /* 1199 * Get node number for proximity domain 1200 */ 1201 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 1202 if (node == -1) { 1203 node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1204 domain); 1205 if (node == -1) 1206 return (-1); 1207 } 1208 1209 /* 1210 * Search for entry with given APIC ID and fill in its node and 1211 * proximity domain IDs (if they haven't been set already) 1212 */ 1213 for (i = 0; i < nentries; i++) { 1214 /* 1215 * Skip nonexistent entries and ones without matching APIC ID 1216 */ 1217 if (!cpu_node[i].exists || cpu_node[i].apicid != apicid) 1218 continue; 1219 1220 /* 1221 * Just return if entry completely and correctly filled in 1222 * already 1223 */ 1224 if (cpu_node[i].prox_domain == domain && 1225 cpu_node[i].node == node) 1226 return (1); 1227 1228 /* 1229 * Fill in node and proximity domain IDs 1230 */ 1231 cpu_node[i].prox_domain = domain; 1232 cpu_node[i].node = node; 1233 1234 return (0); 1235 } 1236 1237 /* 1238 * Return error when entry for APIC ID wasn't found in table 1239 */ 1240 return (-2); 1241 } 1242 1243 1244 /* 1245 * Get node ID for given CPU 1246 */ 1247 static int 1248 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node) 1249 { 1250 processorid_t cpuid; 1251 1252 if (cp == NULL) 1253 return (-1); 1254 1255 cpuid = cp->cpu_id; 1256 if (cpuid < 0 || cpuid >= max_ncpus) 1257 return (-1); 1258 1259 /* 1260 * SRAT doesn't exist, isn't enabled, or there was an error processing 1261 * it, so return chip ID for Opteron and -1 otherwise. 1262 */ 1263 if (srat_ptr == NULL || !lgrp_plat_srat_enable || 1264 lgrp_plat_srat_error) { 1265 if (is_opteron()) 1266 return (pg_plat_hw_instance_id(cp, PGHW_CHIP)); 1267 return (-1); 1268 } 1269 1270 /* 1271 * Return -1 when CPU to node ID mapping entry doesn't exist for given 1272 * CPU 1273 */ 1274 if (!cpu_node[cpuid].exists) 1275 return (-1); 1276 1277 return (cpu_node[cpuid].node); 1278 } 1279 1280 1281 /* 1282 * Return node number for given proximity domain/system locality 1283 */ 1284 static int 1285 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt, 1286 uint32_t domain) 1287 { 1288 uint_t node; 1289 uint_t start; 1290 1291 /* 1292 * Hash proximity domain ID into node to domain mapping table (array), 1293 * search for entry with matching proximity domain ID, and return index 1294 * of matching entry as node ID. 1295 */ 1296 node = start = NODE_DOMAIN_HASH(domain, node_cnt); 1297 do { 1298 if (node_domain[node].prox_domain == domain && 1299 node_domain[node].exists) 1300 return (node); 1301 node = NODE_DOMAIN_HASH(node + 1, node_cnt); 1302 } while (node != start); 1303 return (-1); 1304 } 1305 1306 1307 /* 1308 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 1309 * be considered same 1310 */ 1311 #define LGRP_LAT_TOLERANCE_SHIFT 4 1312 1313 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 1314 1315 1316 /* 1317 * Adjust latencies between nodes to be symmetric, normalize latencies between 1318 * any nodes that are within some tolerance to be same, and make local 1319 * latencies be same 1320 */ 1321 static void 1322 lgrp_plat_latency_adjust(node_phys_addr_map_t *node_memory, 1323 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1324 { 1325 int i; 1326 int j; 1327 int k; 1328 int l; 1329 u_longlong_t max; 1330 u_longlong_t min; 1331 u_longlong_t t; 1332 u_longlong_t t1; 1333 u_longlong_t t2; 1334 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 1335 int lat_corrected[MAX_NODES][MAX_NODES]; 1336 1337 /* 1338 * Nothing to do when this is an UMA machine or don't have args needed 1339 */ 1340 if (max_mem_nodes == 1) 1341 return; 1342 1343 ASSERT(node_memory != NULL && lat_stats != NULL && 1344 probe_stats != NULL); 1345 1346 /* 1347 * Make sure that latencies are symmetric between any two nodes 1348 * (ie. latency(node0, node1) == latency(node1, node0)) 1349 */ 1350 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1351 if (!node_memory[i].exists) 1352 continue; 1353 1354 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1355 if (!node_memory[j].exists) 1356 continue; 1357 1358 t1 = lat_stats->latencies[i][j]; 1359 t2 = lat_stats->latencies[j][i]; 1360 1361 if (t1 == 0 || t2 == 0 || t1 == t2) 1362 continue; 1363 1364 /* 1365 * Latencies should be same 1366 * - Use minimum of two latencies which should be same 1367 * - Track suspect probe times not within tolerance of 1368 * min value 1369 * - Remember how much values are corrected by 1370 */ 1371 if (t1 > t2) { 1372 t = t2; 1373 probe_stats->probe_errors[i][j] += t1 - t2; 1374 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 1375 probe_stats->probe_suspect[i][j]++; 1376 probe_stats->probe_suspect[j][i]++; 1377 } 1378 } else if (t2 > t1) { 1379 t = t1; 1380 probe_stats->probe_errors[j][i] += t2 - t1; 1381 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 1382 probe_stats->probe_suspect[i][j]++; 1383 probe_stats->probe_suspect[j][i]++; 1384 } 1385 } 1386 1387 lat_stats->latencies[i][j] = 1388 lat_stats->latencies[j][i] = t; 1389 lgrp_config(cflag, t1, t); 1390 lgrp_config(cflag, t2, t); 1391 } 1392 } 1393 1394 /* 1395 * Keep track of which latencies get corrected 1396 */ 1397 for (i = 0; i < MAX_NODES; i++) 1398 for (j = 0; j < MAX_NODES; j++) 1399 lat_corrected[i][j] = 0; 1400 1401 /* 1402 * For every two nodes, see whether there is another pair of nodes which 1403 * are about the same distance apart and make the latencies be the same 1404 * if they are close enough together 1405 */ 1406 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1407 if (!node_memory[i].exists) 1408 continue; 1409 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1410 if (!node_memory[j].exists) 1411 continue; 1412 /* 1413 * Pick one pair of nodes (i, j) 1414 * and get latency between them 1415 */ 1416 t1 = lat_stats->latencies[i][j]; 1417 1418 /* 1419 * Skip this pair of nodes if there isn't a latency 1420 * for it yet 1421 */ 1422 if (t1 == 0) 1423 continue; 1424 1425 for (k = 0; k < lgrp_plat_node_cnt; k++) { 1426 if (!node_memory[k].exists) 1427 continue; 1428 for (l = 0; l < lgrp_plat_node_cnt; l++) { 1429 if (!node_memory[l].exists) 1430 continue; 1431 /* 1432 * Pick another pair of nodes (k, l) 1433 * not same as (i, j) and get latency 1434 * between them 1435 */ 1436 if (k == i && l == j) 1437 continue; 1438 1439 t2 = lat_stats->latencies[k][l]; 1440 1441 /* 1442 * Skip this pair of nodes if there 1443 * isn't a latency for it yet 1444 */ 1445 1446 if (t2 == 0) 1447 continue; 1448 1449 /* 1450 * Skip nodes (k, l) if they already 1451 * have same latency as (i, j) or 1452 * their latency isn't close enough to 1453 * be considered/made the same 1454 */ 1455 if (t1 == t2 || (t1 > t2 && t1 - t2 > 1456 t1 >> lgrp_plat_probe_lt_shift) || 1457 (t2 > t1 && t2 - t1 > 1458 t2 >> lgrp_plat_probe_lt_shift)) 1459 continue; 1460 1461 /* 1462 * Make latency(i, j) same as 1463 * latency(k, l), try to use latency 1464 * that has been adjusted already to get 1465 * more consistency (if possible), and 1466 * remember which latencies were 1467 * adjusted for next time 1468 */ 1469 if (lat_corrected[i][j]) { 1470 t = t1; 1471 lgrp_config(cflag, t2, t); 1472 t2 = t; 1473 } else if (lat_corrected[k][l]) { 1474 t = t2; 1475 lgrp_config(cflag, t1, t); 1476 t1 = t; 1477 } else { 1478 if (t1 > t2) 1479 t = t2; 1480 else 1481 t = t1; 1482 lgrp_config(cflag, t1, t); 1483 lgrp_config(cflag, t2, t); 1484 t1 = t2 = t; 1485 } 1486 1487 lat_stats->latencies[i][j] = 1488 lat_stats->latencies[k][l] = t; 1489 1490 lat_corrected[i][j] = 1491 lat_corrected[k][l] = 1; 1492 } 1493 } 1494 } 1495 } 1496 1497 /* 1498 * Local latencies should be same 1499 * - Find min and max local latencies 1500 * - Make all local latencies be minimum 1501 */ 1502 min = -1; 1503 max = 0; 1504 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1505 if (!node_memory[i].exists) 1506 continue; 1507 t = lat_stats->latencies[i][i]; 1508 if (t == 0) 1509 continue; 1510 if (min == -1 || t < min) 1511 min = t; 1512 if (t > max) 1513 max = t; 1514 } 1515 if (min != max) { 1516 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1517 int local; 1518 1519 if (!node_memory[i].exists) 1520 continue; 1521 1522 local = lat_stats->latencies[i][i]; 1523 if (local == 0) 1524 continue; 1525 1526 /* 1527 * Track suspect probe times that aren't within 1528 * tolerance of minimum local latency and how much 1529 * probe times are corrected by 1530 */ 1531 if (local - min > min >> lgrp_plat_probe_lt_shift) 1532 probe_stats->probe_suspect[i][i]++; 1533 1534 probe_stats->probe_errors[i][i] += local - min; 1535 1536 /* 1537 * Make local latencies be minimum 1538 */ 1539 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 1540 lat_stats->latencies[i][i] = min; 1541 } 1542 } 1543 1544 /* 1545 * Determine max probe time again since just adjusted latencies 1546 */ 1547 lat_stats->latency_max = 0; 1548 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1549 if (!node_memory[i].exists) 1550 continue; 1551 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1552 if (!node_memory[j].exists) 1553 continue; 1554 t = lat_stats->latencies[i][j]; 1555 if (t > lat_stats->latency_max) 1556 lat_stats->latency_max = t; 1557 } 1558 } 1559 } 1560 1561 1562 /* 1563 * Verify following about latencies between nodes: 1564 * 1565 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 1566 * - Local latencies same 1567 * - Local < remote 1568 * - Number of latencies seen is reasonable 1569 * - Number of occurrences of a given latency should be more than 1 1570 * 1571 * Returns: 1572 * 0 Success 1573 * -1 Not symmetric 1574 * -2 Local latencies not same 1575 * -3 Local >= remote 1576 */ 1577 static int 1578 lgrp_plat_latency_verify(node_phys_addr_map_t *node_memory, 1579 lgrp_plat_latency_stats_t *lat_stats) 1580 { 1581 int i; 1582 int j; 1583 u_longlong_t t1; 1584 u_longlong_t t2; 1585 1586 ASSERT(node_memory != NULL && lat_stats != NULL); 1587 1588 /* 1589 * Nothing to do when this is an UMA machine, lgroup topology is 1590 * limited to 2 levels, or there aren't any probe times yet 1591 */ 1592 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1593 lat_stats->latencies[0][0] == 0) 1594 return (0); 1595 1596 /* 1597 * Make sure that latencies are symmetric between any two nodes 1598 * (ie. latency(node0, node1) == latency(node1, node0)) 1599 */ 1600 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1601 if (!node_memory[i].exists) 1602 continue; 1603 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1604 if (!node_memory[j].exists) 1605 continue; 1606 t1 = lat_stats->latencies[i][j]; 1607 t2 = lat_stats->latencies[j][i]; 1608 1609 if (t1 == 0 || t2 == 0 || t1 == t2) 1610 continue; 1611 1612 return (-1); 1613 } 1614 } 1615 1616 /* 1617 * Local latencies should be same 1618 */ 1619 t1 = lat_stats->latencies[0][0]; 1620 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1621 if (!node_memory[i].exists) 1622 continue; 1623 1624 t2 = lat_stats->latencies[i][i]; 1625 if (t2 == 0) 1626 continue; 1627 1628 if (t1 == 0) { 1629 t1 = t2; 1630 continue; 1631 } 1632 1633 if (t1 != t2) 1634 return (-2); 1635 } 1636 1637 /* 1638 * Local latencies should be less than remote 1639 */ 1640 if (t1) { 1641 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1642 if (!node_memory[i].exists) 1643 continue; 1644 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1645 if (!node_memory[j].exists) 1646 continue; 1647 t2 = lat_stats->latencies[i][j]; 1648 if (i == j || t2 == 0) 1649 continue; 1650 1651 if (t1 >= t2) 1652 return (-3); 1653 } 1654 } 1655 } 1656 1657 return (0); 1658 } 1659 1660 1661 /* 1662 * Return the number of free, allocatable, or installed 1663 * pages in an lgroup 1664 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 1665 * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 1666 */ 1667 /* ARGSUSED */ 1668 static pgcnt_t 1669 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 1670 { 1671 struct memlist *mlist; 1672 pgcnt_t npgs = 0; 1673 extern struct memlist *phys_avail; 1674 extern struct memlist *phys_install; 1675 1676 switch (query) { 1677 case LGRP_MEM_SIZE_FREE: 1678 return ((pgcnt_t)freemem); 1679 case LGRP_MEM_SIZE_AVAIL: 1680 memlist_read_lock(); 1681 for (mlist = phys_avail; mlist; mlist = mlist->next) 1682 npgs += btop(mlist->size); 1683 memlist_read_unlock(); 1684 return (npgs); 1685 case LGRP_MEM_SIZE_INSTALL: 1686 memlist_read_lock(); 1687 for (mlist = phys_install; mlist; mlist = mlist->next) 1688 npgs += btop(mlist->size); 1689 memlist_read_unlock(); 1690 return (npgs); 1691 default: 1692 return ((pgcnt_t)0); 1693 } 1694 } 1695 1696 1697 /* 1698 * Update node to proximity domain mappings for given domain and return node ID 1699 */ 1700 static int 1701 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, 1702 uint32_t domain) 1703 { 1704 uint_t node; 1705 uint_t start; 1706 1707 /* 1708 * Hash proximity domain ID into node to domain mapping table (array) 1709 * and add entry for it into first non-existent or matching entry found 1710 */ 1711 node = start = NODE_DOMAIN_HASH(domain, node_cnt); 1712 do { 1713 /* 1714 * Entry doesn't exist yet, so create one for this proximity 1715 * domain and return node ID which is index into mapping table. 1716 */ 1717 if (!node_domain[node].exists) { 1718 node_domain[node].exists = 1; 1719 node_domain[node].prox_domain = domain; 1720 return (node); 1721 } 1722 1723 /* 1724 * Entry exists for this proximity domain already, so just 1725 * return node ID (index into table). 1726 */ 1727 if (node_domain[node].prox_domain == domain) 1728 return (node); 1729 node = NODE_DOMAIN_HASH(node + 1, node_cnt); 1730 } while (node != start); 1731 1732 /* 1733 * Ran out of supported number of entries which shouldn't happen.... 1734 */ 1735 ASSERT(node != start); 1736 return (-1); 1737 } 1738 1739 1740 /* 1741 * Update node memory information for given proximity domain with specified 1742 * starting and ending physical address range (and return positive numbers for 1743 * success and negative ones for errors) 1744 */ 1745 static int 1746 lgrp_plat_node_memory_update(node_domain_map_t *node_domain, int node_cnt, 1747 node_phys_addr_map_t *node_memory, uint64_t start, uint64_t end, 1748 uint32_t domain) 1749 { 1750 int node; 1751 1752 /* 1753 * Get node number for proximity domain 1754 */ 1755 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 1756 if (node == -1) { 1757 node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1758 domain); 1759 if (node == -1) 1760 return (-1); 1761 } 1762 1763 /* 1764 * Create entry in table for node if it doesn't exist 1765 */ 1766 if (!node_memory[node].exists) { 1767 node_memory[node].exists = 1; 1768 node_memory[node].start = btop(start); 1769 node_memory[node].end = btop(end); 1770 node_memory[node].prox_domain = domain; 1771 return (0); 1772 } 1773 1774 /* 1775 * Entry already exists for this proximity domain 1776 * 1777 * There may be more than one SRAT memory entry for a domain, so we may 1778 * need to update existing start or end address for the node. 1779 */ 1780 if (node_memory[node].prox_domain == domain) { 1781 if (btop(start) < node_memory[node].start) 1782 node_memory[node].start = btop(start); 1783 if (btop(end) > node_memory[node].end) 1784 node_memory[node].end = btop(end); 1785 return (1); 1786 } 1787 return (-2); 1788 } 1789 1790 1791 /* 1792 * Return time needed to probe from current CPU to memory in given node 1793 */ 1794 static hrtime_t 1795 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 1796 lgrp_plat_probe_mem_config_t *probe_mem_config, 1797 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1798 { 1799 caddr_t buf; 1800 hrtime_t elapsed; 1801 hrtime_t end; 1802 int from; 1803 int i; 1804 int ipl; 1805 hrtime_t max; 1806 hrtime_t min; 1807 hrtime_t start; 1808 extern int use_sse_pagecopy; 1809 1810 /* 1811 * Determine ID of node containing current CPU 1812 */ 1813 from = lgrp_plat_cpu_to_node(CPU, cpu_node); 1814 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1815 1816 /* 1817 * Do common work for probing main memory 1818 */ 1819 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 1820 /* 1821 * Skip probing any nodes without memory and 1822 * set probe time to 0 1823 */ 1824 if (probe_mem_config->probe_va[to] == NULL) { 1825 lat_stats->latencies[from][to] = 0; 1826 return (0); 1827 } 1828 1829 /* 1830 * Invalidate caches once instead of once every sample 1831 * which should cut cost of probing by a lot 1832 */ 1833 probe_stats->flush_cost = gethrtime(); 1834 invalidate_cache(); 1835 probe_stats->flush_cost = gethrtime() - 1836 probe_stats->flush_cost; 1837 probe_stats->probe_cost_total += probe_stats->flush_cost; 1838 } 1839 1840 /* 1841 * Probe from current CPU to given memory using specified operation 1842 * and take specified number of samples 1843 */ 1844 max = 0; 1845 min = -1; 1846 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 1847 probe_stats->probe_cost = gethrtime(); 1848 1849 /* 1850 * Can't measure probe time if gethrtime() isn't working yet 1851 */ 1852 if (probe_stats->probe_cost == 0 && gethrtime() == 0) 1853 return (0); 1854 1855 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 1856 /* 1857 * Measure how long it takes to read vendor ID from 1858 * Northbridge 1859 */ 1860 elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads); 1861 } else { 1862 /* 1863 * Measure how long it takes to copy page 1864 * on top of itself 1865 */ 1866 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 1867 1868 kpreempt_disable(); 1869 ipl = splhigh(); 1870 start = gethrtime(); 1871 if (use_sse_pagecopy) 1872 hwblkpagecopy(buf, buf); 1873 else 1874 bcopy(buf, buf, PAGESIZE); 1875 end = gethrtime(); 1876 elapsed = end - start; 1877 splx(ipl); 1878 kpreempt_enable(); 1879 } 1880 1881 probe_stats->probe_cost = gethrtime() - 1882 probe_stats->probe_cost; 1883 probe_stats->probe_cost_total += probe_stats->probe_cost; 1884 1885 if (min == -1 || elapsed < min) 1886 min = elapsed; 1887 if (elapsed > max) 1888 max = elapsed; 1889 } 1890 1891 /* 1892 * Update minimum and maximum probe times between 1893 * these two nodes 1894 */ 1895 if (min < probe_stats->probe_min[from][to] || 1896 probe_stats->probe_min[from][to] == 0) 1897 probe_stats->probe_min[from][to] = min; 1898 1899 if (max > probe_stats->probe_max[from][to]) 1900 probe_stats->probe_max[from][to] = max; 1901 1902 return (min); 1903 } 1904 1905 1906 /* 1907 * Read boot property with CPU to APIC ID array, fill in CPU to node ID 1908 * mapping table with APIC ID for each CPU, and return number of CPU APIC IDs. 1909 * 1910 * NOTE: This code assumes that CPU IDs are assigned in order that they appear 1911 * in in cpu_apicid_array boot property which is based on and follows 1912 * same ordering as processor list in ACPI MADT. If the code in 1913 * usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns 1914 * CPU IDs ever changes, then this code will need to change too.... 1915 */ 1916 static int 1917 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) 1918 { 1919 int boot_prop_len; 1920 char *boot_prop_name = BP_CPU_APICID_ARRAY; 1921 uint8_t cpu_apicid_array[UINT8_MAX + 1]; 1922 int i; 1923 int n; 1924 1925 /* 1926 * Nothing to do when no array to fill in or not enough CPUs 1927 */ 1928 if (cpu_node == NULL) 1929 return (-1); 1930 1931 /* 1932 * Check length of property value 1933 */ 1934 boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name); 1935 if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array)) 1936 return (-2); 1937 1938 /* 1939 * Calculate number of entries in array and return when there's just 1940 * one CPU since that's not very interesting for NUMA 1941 */ 1942 n = boot_prop_len / sizeof (uint8_t); 1943 if (n == 1) 1944 return (-3); 1945 1946 /* 1947 * Get CPU to APIC ID property value 1948 */ 1949 if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0) 1950 return (-4); 1951 1952 /* 1953 * Fill in CPU to node ID mapping table with APIC ID for each CPU 1954 */ 1955 for (i = 0; i < n; i++) { 1956 cpu_node[i].exists = 1; 1957 cpu_node[i].apicid = cpu_apicid_array[i]; 1958 } 1959 1960 /* 1961 * Return number of CPUs based on number of APIC IDs 1962 */ 1963 return (n); 1964 } 1965 1966 1967 /* 1968 * Read ACPI System Locality Information Table (SLIT) to determine how far each 1969 * NUMA node is from each other 1970 */ 1971 static int 1972 lgrp_plat_process_slit(struct slit *tp, uint_t node_cnt, 1973 node_phys_addr_map_t *node_memory, lgrp_plat_latency_stats_t *lat_stats) 1974 { 1975 int i; 1976 int j; 1977 int localities; 1978 hrtime_t max; 1979 hrtime_t min; 1980 int retval; 1981 uint8_t *slit_entries; 1982 1983 if (tp == NULL || !lgrp_plat_slit_enable) 1984 return (1); 1985 1986 if (lat_stats == NULL) 1987 return (2); 1988 1989 localities = tp->number; 1990 if (localities != node_cnt) 1991 return (3); 1992 1993 min = lat_stats->latency_min; 1994 max = lat_stats->latency_max; 1995 1996 /* 1997 * Fill in latency matrix based on SLIT entries 1998 */ 1999 slit_entries = tp->entry; 2000 for (i = 0; i < localities; i++) { 2001 for (j = 0; j < localities; j++) { 2002 uint8_t latency; 2003 2004 latency = slit_entries[(i * localities) + j]; 2005 lat_stats->latencies[i][j] = latency; 2006 if (latency < min || min == -1) 2007 min = latency; 2008 if (latency > max) 2009 max = latency; 2010 } 2011 } 2012 2013 /* 2014 * Verify that latencies/distances given in SLIT look reasonable 2015 */ 2016 retval = lgrp_plat_latency_verify(node_memory, lat_stats); 2017 2018 if (retval) { 2019 /* 2020 * Reinitialize (zero) latency table since SLIT doesn't look 2021 * right 2022 */ 2023 for (i = 0; i < localities; i++) { 2024 for (j = 0; j < localities; j++) 2025 lat_stats->latencies[i][j] = 0; 2026 } 2027 } else { 2028 /* 2029 * Update min and max latencies seen since SLIT looks valid 2030 */ 2031 lat_stats->latency_min = min; 2032 lat_stats->latency_max = max; 2033 } 2034 2035 return (retval); 2036 } 2037 2038 2039 /* 2040 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 2041 * and memory are local to each other in the same NUMA node and return number 2042 * of nodes 2043 */ 2044 static int 2045 lgrp_plat_process_srat(struct srat *tp, node_domain_map_t *node_domain, 2046 cpu_node_map_t *cpu_node, int cpu_count, node_phys_addr_map_t *node_memory) 2047 { 2048 struct srat_item *srat_end; 2049 int i; 2050 struct srat_item *item; 2051 int node_cnt; 2052 int proc_entry_count; 2053 2054 /* 2055 * Nothing to do when no SRAT or disabled 2056 */ 2057 if (tp == NULL || !lgrp_plat_srat_enable) 2058 return (-1); 2059 2060 /* 2061 * Determine number of nodes by counting number of proximity domains in 2062 * SRAT and return if number of nodes is 1 or less since don't need to 2063 * read SRAT then 2064 */ 2065 node_cnt = lgrp_plat_srat_domains(tp); 2066 if (node_cnt == 1) 2067 return (1); 2068 else if (node_cnt <= 0) 2069 return (-2); 2070 2071 /* 2072 * Walk through SRAT, examining each CPU and memory entry to determine 2073 * which CPUs and memory belong to which node. 2074 */ 2075 item = tp->list; 2076 srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2077 proc_entry_count = 0; 2078 while (item < srat_end) { 2079 uint32_t apic_id; 2080 uint32_t domain; 2081 uint64_t end; 2082 uint64_t length; 2083 uint64_t start; 2084 2085 switch (item->type) { 2086 case SRAT_PROCESSOR: /* CPU entry */ 2087 if (!(item->i.p.flags & SRAT_ENABLED) || 2088 cpu_node == NULL) 2089 break; 2090 2091 /* 2092 * Calculate domain (node) ID and fill in APIC ID to 2093 * domain/node mapping table 2094 */ 2095 domain = item->i.p.domain1; 2096 for (i = 0; i < 3; i++) { 2097 domain += item->i.p.domain2[i] << 2098 ((i + 1) * 8); 2099 } 2100 apic_id = item->i.p.apic_id; 2101 2102 if (lgrp_plat_cpu_node_update(node_domain, node_cnt, 2103 cpu_node, cpu_count, apic_id, domain) < 0) 2104 return (-3); 2105 2106 proc_entry_count++; 2107 break; 2108 2109 case SRAT_MEMORY: /* memory entry */ 2110 if (!(item->i.m.flags & SRAT_ENABLED) || 2111 node_memory == NULL) 2112 break; 2113 2114 /* 2115 * Get domain (node) ID and fill in domain/node 2116 * to memory mapping table 2117 */ 2118 domain = item->i.m.domain; 2119 start = item->i.m.base_addr; 2120 length = item->i.m.len; 2121 end = start + length - 1; 2122 2123 if (lgrp_plat_node_memory_update(node_domain, node_cnt, 2124 node_memory, start, end, domain) < 0) 2125 return (-4); 2126 break; 2127 2128 default: 2129 break; 2130 } 2131 2132 item = (struct srat_item *)((uintptr_t)item + item->len); 2133 } 2134 2135 /* 2136 * Should have seen at least as many SRAT processor entries as CPUs 2137 */ 2138 if (proc_entry_count < cpu_count) 2139 return (-5); 2140 2141 return (node_cnt); 2142 } 2143 2144 2145 /* 2146 * Return number of proximity domains given in ACPI SRAT 2147 */ 2148 static int 2149 lgrp_plat_srat_domains(struct srat *tp) 2150 { 2151 int domain_cnt; 2152 struct srat_item *end; 2153 int i; 2154 struct srat_item *item; 2155 node_domain_map_t node_domain[MAX_NODES]; 2156 2157 2158 if (tp == NULL || !lgrp_plat_srat_enable) 2159 return (1); 2160 2161 /* 2162 * Walk through SRAT, examining each CPU and memory entry to determine 2163 * proximity domain ID for each. 2164 */ 2165 domain_cnt = 0; 2166 item = tp->list; 2167 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2168 bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t)); 2169 while (item < end) { 2170 uint32_t domain; 2171 boolean_t overflow; 2172 uint_t start; 2173 2174 switch (item->type) { 2175 case SRAT_PROCESSOR: /* CPU entry */ 2176 if (!(item->i.p.flags & SRAT_ENABLED)) 2177 break; 2178 domain = item->i.p.domain1; 2179 for (i = 0; i < 3; i++) { 2180 domain += item->i.p.domain2[i] << 2181 ((i + 1) * 8); 2182 } 2183 break; 2184 2185 case SRAT_MEMORY: /* memory entry */ 2186 if (!(item->i.m.flags & SRAT_ENABLED)) 2187 break; 2188 domain = item->i.m.domain; 2189 break; 2190 2191 default: 2192 break; 2193 } 2194 2195 /* 2196 * Count and keep track of which proximity domain IDs seen 2197 */ 2198 start = i = domain % MAX_NODES; 2199 overflow = B_TRUE; 2200 do { 2201 /* 2202 * Create entry for proximity domain and increment 2203 * count when no entry exists where proximity domain 2204 * hashed 2205 */ 2206 if (!node_domain[i].exists) { 2207 node_domain[i].exists = 1; 2208 node_domain[i].prox_domain = domain; 2209 domain_cnt++; 2210 overflow = B_FALSE; 2211 break; 2212 } 2213 2214 /* 2215 * Nothing to do when proximity domain seen already 2216 * and its entry exists 2217 */ 2218 if (node_domain[i].prox_domain == domain) { 2219 overflow = B_FALSE; 2220 break; 2221 } 2222 2223 /* 2224 * Entry exists where proximity domain hashed, but for 2225 * different proximity domain so keep search for empty 2226 * slot to put it or matching entry whichever comes 2227 * first. 2228 */ 2229 i = (i + 1) % MAX_NODES; 2230 } while (i != start); 2231 2232 /* 2233 * Didn't find empty or matching entry which means have more 2234 * proximity domains than supported nodes (:-( 2235 */ 2236 ASSERT(overflow != B_TRUE); 2237 if (overflow == B_TRUE) 2238 return (-1); 2239 2240 item = (struct srat_item *)((uintptr_t)item + item->len); 2241 } 2242 return (domain_cnt); 2243 } 2244 2245 2246 /* 2247 * Set lgroup latencies for 2 level lgroup topology 2248 */ 2249 static void 2250 lgrp_plat_2level_setup(node_phys_addr_map_t *node_memory, 2251 lgrp_plat_latency_stats_t *lat_stats) 2252 { 2253 int i; 2254 2255 ASSERT(node_memory != NULL && lat_stats != NULL); 2256 2257 if (lgrp_plat_node_cnt >= 4) 2258 cmn_err(CE_NOTE, 2259 "MPO only optimizing for local and remote\n"); 2260 for (i = 0; i < lgrp_plat_node_cnt; i++) { 2261 int j; 2262 2263 if (!node_memory[i].exists) 2264 continue; 2265 for (j = 0; j < lgrp_plat_node_cnt; j++) { 2266 if (!node_memory[j].exists) 2267 continue; 2268 if (i == j) 2269 lat_stats->latencies[i][j] = 2; 2270 else 2271 lat_stats->latencies[i][j] = 3; 2272 } 2273 } 2274 lat_stats->latency_min = 2; 2275 lat_stats->latency_max = 3; 2276 lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 2277 } 2278 2279 2280 /* 2281 * The following Opteron specific constants, macros, types, and routines define 2282 * PCI configuration space registers and how to read them to determine the NUMA 2283 * configuration of *supported* Opteron processors. They provide the same 2284 * information that may be gotten from the ACPI System Resource Affinity Table 2285 * (SRAT) if it exists on the machine of interest. 2286 * 2287 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family 2288 * of interest describes all of these registers and their contents. The main 2289 * registers used by this code to determine the NUMA configuration of the 2290 * machine are the node ID register for the number of NUMA nodes and the DRAM 2291 * address map registers for the physical address range of each node. 2292 * 2293 * NOTE: The format and how to determine the NUMA configuration using PCI 2294 * config space registers may change or may not be supported in future 2295 * Opteron processor families. 2296 */ 2297 2298 /* 2299 * How many bits to shift Opteron DRAM Address Map base and limit registers 2300 * to get actual value 2301 */ 2302 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 2303 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 2304 2305 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 2306 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 2307 2308 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 2309 2310 /* 2311 * Macros to derive addresses from Opteron DRAM Address Map registers 2312 */ 2313 #define OPT_DRAMADDR_HI(reg) \ 2314 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 2315 OPT_DRAMADDR_HI_LSHIFT_ADDR) 2316 2317 #define OPT_DRAMADDR_LO(reg) \ 2318 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 2319 OPT_DRAMADDR_LO_LSHIFT_ADDR) 2320 2321 #define OPT_DRAMADDR(high, low) \ 2322 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 2323 2324 /* 2325 * Bit masks defining what's in Opteron DRAM Address Map base register 2326 */ 2327 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 2328 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 2329 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 2330 2331 /* 2332 * Bit masks defining what's in Opteron DRAM Address Map limit register 2333 */ 2334 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 2335 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 2336 2337 2338 /* 2339 * Opteron Node ID register in PCI configuration space contains 2340 * number of nodes in system, etc. for Opteron K8. The following 2341 * constants and macros define its contents, structure, and access. 2342 */ 2343 2344 /* 2345 * Bit masks defining what's in Opteron Node ID register 2346 */ 2347 #define OPT_NODE_MASK_ID 0x7 /* node ID */ 2348 #define OPT_NODE_MASK_CNT 0x70 /* node count */ 2349 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 2350 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 2351 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 2352 2353 /* 2354 * How many bits in Opteron Node ID register to shift right to get actual value 2355 */ 2356 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 2357 2358 /* 2359 * Macros to get values from Opteron Node ID register 2360 */ 2361 #define OPT_NODE_CNT(reg) \ 2362 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 2363 2364 /* 2365 * Macro to setup PCI Extended Configuration Space (ECS) address to give to 2366 * "in/out" instructions 2367 * 2368 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 2369 * other uses should just do MMIO to access PCI ECS. 2370 * Must enable special bit in Northbridge Configuration Register on 2371 * Greyhound for extended CF8 space access to be able to access PCI ECS 2372 * using "in/out" instructions and restore special bit after done 2373 * accessing PCI ECS. 2374 */ 2375 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 2376 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 2377 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 2378 ((((reg) >> 8) & 0xf) << 24)) 2379 2380 /* 2381 * PCI configuration space registers accessed by specifying 2382 * a bus, device, function, and offset. The following constants 2383 * define the values needed to access Opteron K8 configuration 2384 * info to determine its node topology 2385 */ 2386 2387 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 2388 2389 /* 2390 * Opteron PCI configuration space register function values 2391 */ 2392 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 2393 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 2394 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 2395 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 2396 2397 /* 2398 * PCI Configuration Space register offsets 2399 */ 2400 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 2401 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 2402 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 2403 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 2404 2405 /* 2406 * Opteron PCI Configuration Space device IDs for nodes 2407 */ 2408 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 2409 2410 2411 /* 2412 * Opteron DRAM address map gives base and limit for physical memory in a node 2413 */ 2414 typedef struct opt_dram_addr_map { 2415 uint32_t base_hi; 2416 uint32_t base_lo; 2417 uint32_t limit_hi; 2418 uint32_t limit_lo; 2419 } opt_dram_addr_map_t; 2420 2421 2422 /* 2423 * Supported AMD processor families 2424 */ 2425 #define AMD_FAMILY_HAMMER 15 2426 #define AMD_FAMILY_GREYHOUND 16 2427 2428 /* 2429 * Whether to have is_opteron() return 1 even when processor isn't supported 2430 */ 2431 uint_t is_opteron_override = 0; 2432 2433 /* 2434 * AMD processor family for current CPU 2435 */ 2436 uint_t opt_family = 0; 2437 2438 2439 /* 2440 * Determine whether we're running on a supported AMD Opteron since reading 2441 * node count and DRAM address map registers may have different format or 2442 * may not be supported across processor families 2443 */ 2444 static int 2445 is_opteron(void) 2446 { 2447 2448 if (x86_vendor != X86_VENDOR_AMD) 2449 return (0); 2450 2451 opt_family = cpuid_getfamily(CPU); 2452 if (opt_family == AMD_FAMILY_HAMMER || 2453 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 2454 return (1); 2455 else 2456 return (0); 2457 } 2458 2459 2460 /* 2461 * Determine NUMA configuration for Opteron from registers that live in PCI 2462 * configuration space 2463 */ 2464 static void 2465 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 2466 node_phys_addr_map_t *node_memory) 2467 { 2468 uint_t bus; 2469 uint_t dev; 2470 struct opt_dram_addr_map dram_map[MAX_NODES]; 2471 uint_t node; 2472 uint_t node_info[MAX_NODES]; 2473 uint_t off_hi; 2474 uint_t off_lo; 2475 uint64_t nb_cfg_reg; 2476 2477 /* 2478 * Read configuration registers from PCI configuration space to 2479 * determine node information, which memory is in each node, etc. 2480 * 2481 * Write to PCI configuration space address register to specify 2482 * which configuration register to read and read/write PCI 2483 * configuration space data register to get/set contents 2484 */ 2485 bus = OPT_PCS_BUS_CONFIG; 2486 dev = OPT_PCS_DEV_NODE0; 2487 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 2488 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 2489 2490 /* 2491 * Read node ID register for node 0 to get node count 2492 */ 2493 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 2494 OPT_PCS_OFF_NODEID); 2495 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 2496 2497 /* 2498 * If number of nodes is more than maximum supported, then set node 2499 * count to 1 and treat system as UMA instead of NUMA. 2500 */ 2501 if (*node_cnt > MAX_NODES) { 2502 *node_cnt = 1; 2503 return; 2504 } 2505 2506 /* 2507 * For Greyhound, PCI Extended Configuration Space must be enabled to 2508 * read high DRAM address map base and limit registers 2509 */ 2510 if (opt_family == AMD_FAMILY_GREYHOUND) { 2511 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 2512 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2513 wrmsr(MSR_AMD_NB_CFG, 2514 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 2515 } 2516 2517 for (node = 0; node < *node_cnt; node++) { 2518 uint32_t base_hi; 2519 uint32_t base_lo; 2520 uint32_t limit_hi; 2521 uint32_t limit_lo; 2522 2523 /* 2524 * Read node ID register (except for node 0 which we just read) 2525 */ 2526 if (node > 0) { 2527 node_info[node] = pci_getl_func(bus, dev, 2528 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 2529 } 2530 2531 /* 2532 * Read DRAM base and limit registers which specify 2533 * physical memory range of each node 2534 */ 2535 if (opt_family != AMD_FAMILY_GREYHOUND) 2536 base_hi = 0; 2537 else { 2538 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2539 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2540 base_hi = dram_map[node].base_hi = 2541 inl(PCI_CONFDATA); 2542 } 2543 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 2544 OPT_PCS_FUNC_ADDRMAP, off_lo); 2545 2546 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 2547 mem_intrlv) 2548 *mem_intrlv = *mem_intrlv + 1; 2549 2550 off_hi += 4; /* high limit register offset */ 2551 if (opt_family != AMD_FAMILY_GREYHOUND) 2552 limit_hi = 0; 2553 else { 2554 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 2555 OPT_PCS_FUNC_ADDRMAP, off_hi)); 2556 limit_hi = dram_map[node].limit_hi = 2557 inl(PCI_CONFDATA); 2558 } 2559 2560 off_lo += 4; /* low limit register offset */ 2561 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 2562 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 2563 2564 /* 2565 * Increment device number to next node and register offsets 2566 * for DRAM base register of next node 2567 */ 2568 off_hi += 4; 2569 off_lo += 4; 2570 dev++; 2571 2572 /* 2573 * Both read and write enable bits must be enabled in DRAM 2574 * address map base register for physical memory to exist in 2575 * node 2576 */ 2577 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 2578 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 2579 /* 2580 * Mark node memory as non-existent and set start and 2581 * end addresses to be same in node_memory[] 2582 */ 2583 node_memory[node].exists = 0; 2584 node_memory[node].start = node_memory[node].end = 2585 (pfn_t)-1; 2586 continue; 2587 } 2588 2589 /* 2590 * Mark node memory as existing and remember physical address 2591 * range of each node for use later 2592 */ 2593 node_memory[node].exists = 1; 2594 2595 node_memory[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 2596 2597 node_memory[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 2598 OPT_DRAMADDR_LO_MASK_OFF); 2599 } 2600 2601 /* 2602 * Restore PCI Extended Configuration Space enable bit 2603 */ 2604 if (opt_family == AMD_FAMILY_GREYHOUND) { 2605 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 2606 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 2607 } 2608 } 2609 2610 2611 /* 2612 * Return average amount of time to read vendor ID register on Northbridge 2613 * N times on specified destination node from current CPU 2614 */ 2615 static hrtime_t 2616 opt_probe_vendor(int dest_node, int nreads) 2617 { 2618 int cnt; 2619 uint_t dev; 2620 /* LINTED: set but not used in function */ 2621 volatile uint_t dev_vendor; 2622 hrtime_t elapsed; 2623 hrtime_t end; 2624 int ipl; 2625 hrtime_t start; 2626 2627 dev = OPT_PCS_DEV_NODE0 + dest_node; 2628 kpreempt_disable(); 2629 ipl = spl8(); 2630 outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM, 2631 OPT_PCS_OFF_VENDOR)); 2632 start = gethrtime(); 2633 for (cnt = 0; cnt < nreads; cnt++) 2634 dev_vendor = inl(PCI_CONFDATA); 2635 end = gethrtime(); 2636 elapsed = (end - start) / nreads; 2637 splx(ipl); 2638 kpreempt_enable(); 2639 return (elapsed); 2640 } 2641