1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved. 24 */ 25 /* 26 * Copyright (c) 2010, Intel Corporation. 27 * All rights reserved. 28 */ 29 30 /* 31 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS 32 * ================================================================ 33 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access 34 * (NUMA). A NUMA machine consists of one or more "nodes" that each consist of 35 * one or more CPUs and some local memory. The CPUs in each node can access 36 * the memory in the other nodes but at a higher latency than accessing their 37 * local memory. Typically, a system with only one node has Uniform Memory 38 * Access (UMA), but it may be possible to have a one node system that has 39 * some global memory outside of the node which is higher latency. 40 * 41 * Module Description 42 * ------------------ 43 * This module provides a platform interface for determining which CPUs and 44 * which memory (and how much) are in a NUMA node and how far each node is from 45 * each other. The interface is used by the Virtual Memory (VM) system and the 46 * common lgroup framework. The VM system uses the plat_*() routines to fill 47 * in its memory node (memnode) array with the physical address range spanned 48 * by each NUMA node to know which memory belongs to which node, so it can 49 * build and manage a physical page free list for each NUMA node and allocate 50 * local memory from each node as needed. The common lgroup framework uses the 51 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong 52 * to each node (leaf lgroup) and how far each node is from each other, so it 53 * can build the latency (lgroup) topology for the machine in order to optimize 54 * for locality. Also, an lgroup platform handle instead of lgroups are used 55 * in the interface with this module, so this module shouldn't need to know 56 * anything about lgroups. Instead, it just needs to know which CPUs, memory, 57 * etc. are in each NUMA node, how far each node is from each other, and to use 58 * a unique lgroup platform handle to refer to each node through the interface. 59 * 60 * Determining NUMA Configuration 61 * ------------------------------ 62 * By default, this module will try to determine the NUMA configuration of the 63 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System 64 * Locality Information Table (SLIT). The SRAT contains info to tell which 65 * CPUs and memory are local to a given proximity domain (NUMA node). 
The SLIT 66 * is a matrix that gives the distance between each system locality (which is 67 * a NUMA node and should correspond to proximity domains in the SRAT). For 68 * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer 69 * specification. 70 * 71 * If the SRAT doesn't exist on a system with AMD Opteron processors, we 72 * examine registers in PCI configuration space to determine how many nodes are 73 * in the system and which CPUs and memory are in each node. 74 * 75 * 76 * NOTE: Using these PCI configuration space registers to determine this 77 * locality info is not guaranteed to work or be compatible across all 78 * Opteron processor families. 79 * 80 * If the SLIT does not exist or look right, the kernel will probe to determine 81 * the distance between nodes as long as the NUMA CPU and memory configuration 82 * has been determined (see lgrp_plat_probe() for details). 83 * 84 * Data Structures 85 * --------------- 86 * The main data structures used by this code are the following: 87 * 88 * - lgrp_plat_cpu_node[] CPU to node ID mapping table indexed by 89 * CPU ID (only used for SRAT) 90 * 91 * - lgrp_plat_lat_stats.latencies[][] Table of latencies between same and 92 * different nodes indexed by node ID 93 * 94 * - lgrp_plat_node_cnt Number of NUMA nodes in system for 95 * non-DR-capable systems, 96 * maximum possible number of NUMA nodes 97 * in system for DR capable systems. 98 * 99 * - lgrp_plat_node_domain[] Node ID to proximity domain ID mapping 100 * table indexed by node ID (only used 101 * for SRAT) 102 * 103 * - lgrp_plat_memnode_info[] Table with physical address range for 104 * each memory node indexed by memory node 105 * ID 106 * 107 * The code is implemented to make the following always be true: 108 * 109 * lgroup platform handle == node ID == memnode ID 110 * 111 * Moreover, it allows for the proximity domain ID to be equal to all of the 112 * above as long as the proximity domain IDs are numbered from 0 to <number of 113 * nodes - 1>. This is done by hashing each proximity domain ID into the range 114 * from 0 to <number of nodes - 1>. Then proximity domain ID N will hash into node ID 115 * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N] 116 * and be assigned node ID N. If the proximity domain IDs aren't numbered 117 * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into 118 * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs 119 * to node IDs. However, the proximity domain IDs may not map to the 120 * equivalent node ID since we want to keep the node IDs numbered from 0 to 121 * <number of nodes - 1> to minimize cost of searching and potentially space. 122 * 123 * With the introduction of support of memory DR operations on x86 platforms, 124 * things get a little complicated. The addresses of hot-added memory may not 125 * be contiguous with other memory connected to the same lgrp node. In other 126 * words, memory addresses may get interleaved among lgrp nodes after memory 127 * DR operations. To work around this limitation, we have extended the 128 * relationship between lgrp node and memory node from a 1:1 map to a 1:N map, 129 * which means there may be multiple memory nodes associated with an lgrp node 130 * after memory DR operations. 131 * 132 * To minimize the code changes to support memory DR operations, the 133 * following policies have been adopted.
134 * 1) On non-DR-capable systems, the relationship among lgroup platform handle, 135 * node ID and memnode ID is still kept as: 136 * lgroup platform handle == node ID == memnode ID 137 * 2) For memory present at boot time on DR capable platforms, the relationship 138 * is still kept as is. 139 * lgroup platform handle == node ID == memnode ID 140 * 3) For hot-added memory, the relationship between lgrp ID and memnode ID have 141 * been changed from 1:1 map to 1:N map. Memnode IDs [0 - lgrp_plat_node_cnt) 142 * are reserved for memory present at boot time, and memnode IDs 143 * [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically allocate 144 * memnode ID for hot-added memory. 145 * 4) All boot code having the assumption "node ID == memnode ID" can live as 146 * is, that's because node ID is always equal to memnode ID at boot time. 147 * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and 148 * lgrp_plat_mem_size() related logics have been enhanced to deal with 149 * the 1:N map relationship. 150 * 6) The latency probing related logics, which have the assumption 151 * "node ID == memnode ID" and may be called at run time, is disabled if 152 * memory DR operation is enabled. 153 */ 154 155 156 #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ 157 #include <sys/atomic.h> 158 #include <sys/bootconf.h> 159 #include <sys/cmn_err.h> 160 #include <sys/controlregs.h> 161 #include <sys/cpupart.h> 162 #include <sys/cpuvar.h> 163 #include <sys/lgrp.h> 164 #include <sys/machsystm.h> 165 #include <sys/memlist.h> 166 #include <sys/memnode.h> 167 #include <sys/mman.h> 168 #include <sys/note.h> 169 #include <sys/pci_cfgspace.h> 170 #include <sys/pci_impl.h> 171 #include <sys/param.h> 172 #include <sys/pghw.h> 173 #include <sys/promif.h> /* for prom_printf() */ 174 #include <sys/sysmacros.h> 175 #include <sys/systm.h> 176 #include <sys/thread.h> 177 #include <sys/types.h> 178 #include <sys/var.h> 179 #include <sys/x86_archext.h> 180 #include <vm/hat_i86.h> 181 #include <vm/seg_kmem.h> 182 #include <vm/vm_dep.h> 183 184 #include <sys/acpidev.h> 185 #include "acpi_fw.h" /* for SRAT, SLIT and MSCT */ 186 187 188 #define MAX_NODES 8 189 #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1) 190 191 /* 192 * Constants for configuring probing 193 */ 194 #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */ 195 #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */ 196 #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */ 197 198 /* 199 * Flags for probing 200 */ 201 #define LGRP_PLAT_PROBE_ENABLE 0x1 /* enable probing */ 202 #define LGRP_PLAT_PROBE_PGCPY 0x2 /* probe using page copy */ 203 #define LGRP_PLAT_PROBE_VENDOR 0x4 /* probe vendor ID register */ 204 205 /* 206 * Hash proximity domain ID into node to domain mapping table "mod" number of 207 * nodes to minimize span of entries used and try to have lowest numbered 208 * proximity domain be node 0 209 */ 210 #define NODE_DOMAIN_HASH(domain, node_cnt) \ 211 ((lgrp_plat_prox_domain_min == UINT32_MAX) ? 
(domain) % node_cnt : \ 212 ((domain) - lgrp_plat_prox_domain_min) % node_cnt) 213 214 /* 215 * CPU to node ID mapping structure (only used with SRAT) 216 */ 217 typedef struct cpu_node_map { 218 int exists; 219 uint_t node; 220 uint32_t apicid; 221 uint32_t prox_domain; 222 } cpu_node_map_t; 223 224 /* 225 * Latency statistics 226 */ 227 typedef struct lgrp_plat_latency_stats { 228 hrtime_t latencies[MAX_NODES][MAX_NODES]; 229 hrtime_t latency_max; 230 hrtime_t latency_min; 231 } lgrp_plat_latency_stats_t; 232 233 /* 234 * Memory configuration for probing 235 */ 236 typedef struct lgrp_plat_probe_mem_config { 237 size_t probe_memsize; /* how much memory to probe per node */ 238 caddr_t probe_va[MAX_NODES]; /* where memory mapped for probing */ 239 pfn_t probe_pfn[MAX_NODES]; /* physical pages to map for probing */ 240 } lgrp_plat_probe_mem_config_t; 241 242 /* 243 * Statistics kept for probing 244 */ 245 typedef struct lgrp_plat_probe_stats { 246 hrtime_t flush_cost; 247 hrtime_t probe_cost; 248 hrtime_t probe_cost_total; 249 hrtime_t probe_error_code; 250 hrtime_t probe_errors[MAX_NODES][MAX_NODES]; 251 int probe_suspect[MAX_NODES][MAX_NODES]; 252 hrtime_t probe_max[MAX_NODES][MAX_NODES]; 253 hrtime_t probe_min[MAX_NODES][MAX_NODES]; 254 } lgrp_plat_probe_stats_t; 255 256 /* 257 * Node to proximity domain ID mapping structure (only used with SRAT) 258 */ 259 typedef struct node_domain_map { 260 int exists; 261 uint32_t prox_domain; 262 } node_domain_map_t; 263 264 /* 265 * Node ID and starting and ending page for physical memory in memory node 266 */ 267 typedef struct memnode_phys_addr_map { 268 pfn_t start; 269 pfn_t end; 270 int exists; 271 uint32_t prox_domain; 272 uint32_t device_id; 273 uint_t lgrphand; 274 } memnode_phys_addr_map_t; 275 276 /* 277 * Number of CPUs for which we got APIC IDs 278 */ 279 static int lgrp_plat_apic_ncpus = 0; 280 281 /* 282 * CPU to node ID mapping table (only used for SRAT) and its max number of 283 * entries 284 */ 285 static cpu_node_map_t *lgrp_plat_cpu_node = NULL; 286 static uint_t lgrp_plat_cpu_node_nentries = 0; 287 288 /* 289 * Latency statistics 290 */ 291 lgrp_plat_latency_stats_t lgrp_plat_lat_stats; 292 293 /* 294 * Whether memory is interleaved across nodes causing MPO to be disabled 295 */ 296 static int lgrp_plat_mem_intrlv = 0; 297 298 /* 299 * Node ID to proximity domain ID mapping table (only used for SRAT) 300 */ 301 static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; 302 303 /* 304 * Physical address range for memory in each node 305 */ 306 static memnode_phys_addr_map_t lgrp_plat_memnode_info[MAX_MEM_NODES]; 307 308 /* 309 * Statistics gotten from probing 310 */ 311 static lgrp_plat_probe_stats_t lgrp_plat_probe_stats; 312 313 /* 314 * Memory configuration for probing 315 */ 316 static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config; 317 318 /* 319 * Lowest proximity domain ID seen in ACPI SRAT 320 */ 321 static uint32_t lgrp_plat_prox_domain_min = UINT32_MAX; 322 323 /* 324 * Error code from processing ACPI SRAT 325 */ 326 static int lgrp_plat_srat_error = 0; 327 328 /* 329 * Error code from processing ACPI SLIT 330 */ 331 static int lgrp_plat_slit_error = 0; 332 333 /* 334 * Whether lgrp topology has been flattened to 2 levels. 335 */ 336 static int lgrp_plat_topo_flatten = 0; 337 338 339 /* 340 * Maximum memory node ID in use. 
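 * This starts out equal to lgrp_plat_node_cnt (one memnode per node for
 * memory present at boot); if memory DR is enabled, hot-added ranges may be
 * given memnode IDs in [lgrp_plat_node_cnt, max_mem_nodes), which can raise
 * this value (see policy 3 in the block comment at the top of this file).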
341 */ 342 static uint_t lgrp_plat_max_mem_node; 343 344 /* 345 * Allocate lgroup array statically 346 */ 347 static lgrp_t lgrp_space[NLGRP]; 348 static int nlgrps_alloc; 349 350 351 /* 352 * Enable finding and using minimum proximity domain ID when hashing 353 */ 354 int lgrp_plat_domain_min_enable = 1; 355 356 /* 357 * Maximum possible number of nodes in system 358 */ 359 uint_t lgrp_plat_node_cnt = 1; 360 361 /* 362 * Enable sorting nodes in ascending order by starting physical address 363 */ 364 int lgrp_plat_node_sort_enable = 1; 365 366 /* 367 * Configuration Parameters for Probing 368 * - lgrp_plat_probe_flags Flags to specify enabling probing, probe 369 * operation, etc. 370 * - lgrp_plat_probe_nrounds How many rounds of probing to do 371 * - lgrp_plat_probe_nsamples Number of samples to take when probing each 372 * node 373 * - lgrp_plat_probe_nreads Number of times to read vendor ID from 374 * Northbridge for each probe 375 */ 376 uint_t lgrp_plat_probe_flags = 0; 377 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 378 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 379 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 380 381 /* 382 * Enable use of ACPI System Resource Affinity Table (SRAT), System 383 * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT) 384 */ 385 int lgrp_plat_srat_enable = 1; 386 int lgrp_plat_slit_enable = 1; 387 int lgrp_plat_msct_enable = 1; 388 389 /* 390 * mnode_xwa: set to non-zero value to initiate workaround if large pages are 391 * found to be crossing memory node boundaries. The workaround will eliminate 392 * a base size page at the end of each memory node boundary to ensure that 393 * a large page with constituent pages that span more than 1 memory node 394 * can never be formed. 
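 * For example, on x86 a 2 MB large page is built from 512 contiguous 4 KB
 * base pages; if two memory nodes happen to be physically contiguous and
 * their boundary is not 2 MB aligned, such a page could otherwise be
 * assembled from base pages belonging to both nodes. Giving up the last
 * base page of the lower node leaves a hole that prevents this.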
395 * 396 */ 397 int mnode_xwa = 1; 398 399 /* 400 * Static array to hold lgroup statistics 401 */ 402 struct lgrp_stats lgrp_stats[NLGRP]; 403 404 405 /* 406 * Forward declarations of platform interface routines 407 */ 408 void plat_build_mem_nodes(struct memlist *list); 409 410 int plat_mnode_xcheck(pfn_t pfncnt); 411 412 lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); 413 414 int plat_pfn_to_mem_node(pfn_t pfn); 415 416 /* 417 * Forward declarations of lgroup platform interface routines 418 */ 419 lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); 420 421 void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg); 422 423 lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); 424 425 void lgrp_plat_init(lgrp_init_stages_t stage); 426 427 int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); 428 429 int lgrp_plat_max_lgrps(void); 430 431 pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, 432 lgrp_mem_query_t query); 433 434 lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn); 435 436 void lgrp_plat_probe(void); 437 438 lgrp_handle_t lgrp_plat_root_hand(void); 439 440 441 /* 442 * Forward declarations of local routines 443 */ 444 static int is_opteron(void); 445 446 static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, 447 int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, 448 uint32_t domain); 449 450 static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, 451 int cpu_node_nentries); 452 453 static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, 454 int node_cnt, uint32_t domain); 455 456 static void lgrp_plat_get_numa_config(void); 457 458 static void lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, 459 lgrp_plat_latency_stats_t *lat_stats, 460 lgrp_plat_probe_stats_t *probe_stats); 461 462 static int lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, 463 lgrp_plat_latency_stats_t *lat_stats); 464 465 static void lgrp_plat_main_init(void); 466 467 static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); 468 469 static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, 470 int node_cnt, uint32_t domain); 471 472 static int lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, 473 int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt, 474 uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id); 475 476 static void lgrp_plat_node_sort(node_domain_map_t *node_domain, 477 int node_cnt, cpu_node_map_t *cpu_node, int cpu_count, 478 memnode_phys_addr_map_t *memnode_info); 479 480 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 481 int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config, 482 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats); 483 484 static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node); 485 486 static int lgrp_plat_process_slit(struct slit *tp, 487 node_domain_map_t *node_domain, uint_t node_cnt, 488 memnode_phys_addr_map_t *memnode_info, 489 lgrp_plat_latency_stats_t *lat_stats); 490 491 static int lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info, 492 uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt, 493 lgrp_plat_latency_stats_t *lat_stats); 494 495 static int lgrp_plat_process_srat(struct srat *tp, struct msct *mp, 496 uint32_t *prox_domain_min, node_domain_map_t *node_domain, 497 cpu_node_map_t *cpu_node, int cpu_count, 498 memnode_phys_addr_map_t *memnode_info); 499 500 static void lgrp_plat_release_bootstrap(void); 501 
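/*
 * Illustration (assumed example values, not part of the interface): if the
 * SRAT reports proximity domains 0x10, 0x11, 0x12 and 0x13 on a 4-node
 * system, lgrp_plat_prox_domain_min becomes 0x10 and
 *
 *	NODE_DOMAIN_HASH(0x12, lgrp_plat_node_cnt)
 *
 * yields (0x12 - 0x10) % 4 == 2, so domain 0x12 lands in
 * lgrp_plat_node_domain[2] and is assigned node ID 2. When domains do not
 * hash to unique slots, lgrp_plat_domain_to_node() searches forward
 * (with wraparound) from the hashed slot until it finds the matching entry.
 */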
502 static int lgrp_plat_srat_domains(struct srat *tp, 503 uint32_t *prox_domain_min); 504 505 static int lgrp_plat_msct_domains(struct msct *tp, 506 uint32_t *prox_domain_min); 507 508 static void lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats); 509 510 static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 511 memnode_phys_addr_map_t *memnode_info); 512 513 static hrtime_t opt_probe_vendor(int dest_node, int nreads); 514 515 516 /* 517 * PLATFORM INTERFACE ROUTINES 518 */ 519 520 /* 521 * Configure memory nodes for machines with more than one node (ie NUMA) 522 */ 523 void 524 plat_build_mem_nodes(struct memlist *list) 525 { 526 pfn_t cur_start; /* start addr of subrange */ 527 pfn_t cur_end; /* end addr of subrange */ 528 pfn_t start; /* start addr of whole range */ 529 pfn_t end; /* end addr of whole range */ 530 pgcnt_t endcnt; /* pages to sacrifice */ 531 532 /* 533 * Boot install lists are arranged <addr, len>, ... 534 */ 535 while (list) { 536 int node; 537 538 start = list->ml_address >> PAGESHIFT; 539 end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT; 540 541 if (start > physmax) { 542 list = list->ml_next; 543 continue; 544 } 545 if (end > physmax) 546 end = physmax; 547 548 /* 549 * When there is only one memnode, just add memory to memnode 550 */ 551 if (max_mem_nodes == 1) { 552 mem_node_add_slice(start, end); 553 list = list->ml_next; 554 continue; 555 } 556 557 /* 558 * mem_node_add_slice() expects to get a memory range that 559 * is within one memnode, so need to split any memory range 560 * that spans multiple memnodes into subranges that are each 561 * contained within one memnode when feeding them to 562 * mem_node_add_slice() 563 */ 564 cur_start = start; 565 do { 566 node = plat_pfn_to_mem_node(cur_start); 567 568 /* 569 * Panic if DRAM address map registers or SRAT say 570 * memory in node doesn't exist or address from 571 * boot installed memory list entry isn't in this node. 572 * This shouldn't happen and rest of code can't deal 573 * with this if it does. 574 */ 575 if (node < 0 || node >= lgrp_plat_max_mem_node || 576 !lgrp_plat_memnode_info[node].exists || 577 cur_start < lgrp_plat_memnode_info[node].start || 578 cur_start > lgrp_plat_memnode_info[node].end) { 579 cmn_err(CE_PANIC, "Don't know which memnode " 580 "to add installed memory address 0x%lx\n", 581 cur_start); 582 } 583 584 /* 585 * End of current subrange should not span memnodes 586 */ 587 cur_end = end; 588 endcnt = 0; 589 if (lgrp_plat_memnode_info[node].exists && 590 cur_end > lgrp_plat_memnode_info[node].end) { 591 cur_end = lgrp_plat_memnode_info[node].end; 592 if (mnode_xwa > 1) { 593 /* 594 * sacrifice the last page in each 595 * node to eliminate large pages 596 * that span more than 1 memory node. 597 */ 598 endcnt = 1; 599 physinstalled--; 600 } 601 } 602 603 mem_node_add_slice(cur_start, cur_end - endcnt); 604 605 /* 606 * Next subrange starts after end of current one 607 */ 608 cur_start = cur_end + 1; 609 } while (cur_end < end); 610 611 list = list->ml_next; 612 } 613 mem_node_physalign = 0; 614 mem_node_pfn_shift = 0; 615 } 616 617 618 /* 619 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 620 * range of pages aligned on pfncnt that crosses a node boundary. Returns 1 if 621 * a crossing is found and returns 0 otherwise.
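 * For example, with pfncnt == 0x200 (a 2 MB large page of 4 KB base pages),
 * a node whose starting PFN is contiguous with the previous node but is not
 * 0x200-aligned means a 0x200-aligned, 0x200-page range can straddle the
 * two nodes; that is the case detected below (and worked around by
 * plat_build_mem_nodes() when mnode_xwa is set).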
622 */ 623 int 624 plat_mnode_xcheck(pfn_t pfncnt) 625 { 626 int node, prevnode = -1, basenode; 627 pfn_t ea, sa; 628 629 for (node = 0; node < lgrp_plat_max_mem_node; node++) { 630 631 if (lgrp_plat_memnode_info[node].exists == 0) 632 continue; 633 634 if (prevnode == -1) { 635 prevnode = node; 636 basenode = node; 637 continue; 638 } 639 640 /* assume x86 node pfn ranges are in increasing order */ 641 ASSERT(lgrp_plat_memnode_info[node].start > 642 lgrp_plat_memnode_info[prevnode].end); 643 644 /* 645 * continue if the starting address of node is not contiguous 646 * with the previous node. 647 */ 648 649 if (lgrp_plat_memnode_info[node].start != 650 (lgrp_plat_memnode_info[prevnode].end + 1)) { 651 basenode = node; 652 prevnode = node; 653 continue; 654 } 655 656 /* check if the starting address of node is pfncnt aligned */ 657 if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) { 658 659 /* 660 * at this point, node starts at an unaligned boundary 661 * and is contiguous with the previous node(s) to 662 * basenode. Check if there is an aligned contiguous 663 * range of length pfncnt that crosses this boundary. 664 */ 665 666 sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end, 667 pfncnt); 668 ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start), 669 pfncnt); 670 671 ASSERT((ea - sa) == pfncnt); 672 if (sa >= lgrp_plat_memnode_info[basenode].start && 673 ea <= (lgrp_plat_memnode_info[node].end + 1)) { 674 /* 675 * large page found to cross mnode boundary. 676 * Return Failure if workaround not enabled. 677 */ 678 if (mnode_xwa == 0) 679 return (1); 680 mnode_xwa++; 681 } 682 } 683 prevnode = node; 684 } 685 return (0); 686 } 687 688 689 lgrp_handle_t 690 plat_mem_node_to_lgrphand(int mnode) 691 { 692 if (max_mem_nodes == 1) 693 return (LGRP_DEFAULT_HANDLE); 694 695 ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node); 696 697 return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand)); 698 } 699 700 int 701 plat_pfn_to_mem_node(pfn_t pfn) 702 { 703 int node; 704 705 if (max_mem_nodes == 1) 706 return (0); 707 708 for (node = 0; node < lgrp_plat_max_mem_node; node++) { 709 /* 710 * Skip nodes with no memory 711 */ 712 if (!lgrp_plat_memnode_info[node].exists) 713 continue; 714 715 membar_consumer(); 716 if (pfn >= lgrp_plat_memnode_info[node].start && 717 pfn <= lgrp_plat_memnode_info[node].end) 718 return (node); 719 } 720 721 /* 722 * Didn't find memnode where this PFN lives which should never happen 723 */ 724 ASSERT(node < lgrp_plat_max_mem_node); 725 return (-1); 726 } 727 728 729 /* 730 * LGROUP PLATFORM INTERFACE ROUTINES 731 */ 732 733 /* 734 * Allocate additional space for an lgroup. 735 */ 736 lgrp_t * 737 lgrp_plat_alloc(lgrp_id_t lgrpid) 738 { 739 lgrp_t *lgrp; 740 741 lgrp = &lgrp_space[nlgrps_alloc++]; 742 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 743 return (NULL); 744 return (lgrp); 745 } 746 747 748 /* 749 * Platform handling for (re)configuration changes 750 * 751 * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug: 752 * 1) Use cpu_lock to synchronize between lgrp_plat_config() and 753 * lgrp_plat_cpu_to_hand(). 754 * 2) Disable latency probing logic by making sure that the flag 755 * LGRP_PLAT_PROBE_ENABLE is cleared. 756 * 757 * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug: 758 * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal. 759 * 2) Only expansion to existing entries, no shrinking. 760 * 3) On writing side, DR framework ensures that lgrp_plat_config() is called 761 * in single-threaded context. 
And membar_producer() is used to ensure that 762 * all changes are visible to other CPUs before setting the "exists" flag. 763 * 4) On reading side, membar_consumer() after checking the "exists" flag 764 * ensures that right values are retrieved. 765 * 766 * Mechanism to protect lgrp_plat_node_domain[] at hotplug: 767 * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal. 768 * 2) On writing side, it's single-threaded and membar_producer() is used to 769 * ensure all changes are visible to other CPUs before setting the "exists" 770 * flag. 771 * 3) On reading side, membar_consumer() after checking the "exists" flag 772 * ensures that right values are retrieved. 773 */ 774 void 775 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 776 { 777 #ifdef __xpv 778 _NOTE(ARGUNUSED(flag, arg)); 779 #else 780 int rc, node; 781 cpu_t *cp; 782 void *hdl = NULL; 783 uchar_t *sliptr = NULL; 784 uint32_t domain, apicid, slicnt = 0; 785 update_membounds_t *mp; 786 787 extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *, 788 uint32_t *, uint32_t *, uchar_t **); 789 extern void acpidev_dr_free_cpu_numa_info(void *); 790 791 /* 792 * This interface is used to support CPU/memory DR operations. 793 * Don't bother here if it's still during boot or only one lgrp node 794 * is supported. 795 */ 796 if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1) 797 return; 798 799 switch (flag) { 800 case LGRP_CONFIG_CPU_ADD: 801 cp = (cpu_t *)arg; 802 ASSERT(cp != NULL); 803 ASSERT(MUTEX_HELD(&cpu_lock)); 804 805 /* Check whether CPU already exists. */ 806 ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists); 807 if (lgrp_plat_cpu_node[cp->cpu_id].exists) { 808 cmn_err(CE_WARN, 809 "!lgrp: CPU(%d) already exists in cpu_node map.", 810 cp->cpu_id); 811 break; 812 } 813 814 /* Query CPU lgrp information. */ 815 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain, 816 &slicnt, &sliptr); 817 ASSERT(rc == 0); 818 if (rc != 0) { 819 cmn_err(CE_WARN, 820 "!lgrp: failed to query lgrp info for CPU(%d).", 821 cp->cpu_id); 822 break; 823 } 824 825 /* Update node to proximity domain mapping */ 826 node = lgrp_plat_domain_to_node(lgrp_plat_node_domain, 827 lgrp_plat_node_cnt, domain); 828 if (node == -1) { 829 node = lgrp_plat_node_domain_update( 830 lgrp_plat_node_domain, lgrp_plat_node_cnt, domain); 831 ASSERT(node != -1); 832 if (node == -1) { 833 acpidev_dr_free_cpu_numa_info(hdl); 834 cmn_err(CE_WARN, "!lgrp: failed to update " 835 "node_domain map for domain(%u).", domain); 836 break; 837 } 838 } 839 840 /* Update latency information among lgrps. */ 841 if (slicnt != 0 && sliptr != NULL) { 842 if (lgrp_plat_process_sli(domain, sliptr, slicnt, 843 lgrp_plat_node_domain, lgrp_plat_node_cnt, 844 &lgrp_plat_lat_stats) != 0) { 845 cmn_err(CE_WARN, "!lgrp: failed to update " 846 "latency information for domain (%u).", 847 domain); 848 } 849 } 850 851 /* Update CPU to node mapping. */ 852 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain; 853 lgrp_plat_cpu_node[cp->cpu_id].node = node; 854 lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid; 855 lgrp_plat_cpu_node[cp->cpu_id].exists = 1; 856 lgrp_plat_apic_ncpus++; 857 858 acpidev_dr_free_cpu_numa_info(hdl); 859 break; 860 861 case LGRP_CONFIG_CPU_DEL: 862 cp = (cpu_t *)arg; 863 ASSERT(cp != NULL); 864 ASSERT(MUTEX_HELD(&cpu_lock)); 865 866 /* Check whether CPU exists. 
*/ 867 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists); 868 if (!lgrp_plat_cpu_node[cp->cpu_id].exists) { 869 cmn_err(CE_WARN, 870 "!lgrp: CPU(%d) doesn't exist in cpu_node map.", 871 cp->cpu_id); 872 break; 873 } 874 875 /* Query CPU lgrp information. */ 876 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain, 877 NULL, NULL); 878 ASSERT(rc == 0); 879 if (rc != 0) { 880 cmn_err(CE_WARN, 881 "!lgrp: failed to query lgrp info for CPU(%d).", 882 cp->cpu_id); 883 break; 884 } 885 886 /* Update map. */ 887 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid); 888 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain); 889 lgrp_plat_cpu_node[cp->cpu_id].exists = 0; 890 lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX; 891 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX; 892 lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX; 893 lgrp_plat_apic_ncpus--; 894 895 acpidev_dr_free_cpu_numa_info(hdl); 896 break; 897 898 case LGRP_CONFIG_MEM_ADD: 899 mp = (update_membounds_t *)arg; 900 ASSERT(mp != NULL); 901 902 /* Update latency information among lgrps. */ 903 if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) { 904 if (lgrp_plat_process_sli(mp->u_domain, 905 mp->u_sli_ptr, mp->u_sli_cnt, 906 lgrp_plat_node_domain, lgrp_plat_node_cnt, 907 &lgrp_plat_lat_stats) != 0) { 908 cmn_err(CE_WARN, "!lgrp: failed to update " 909 "latency information for domain (%u).", 910 mp->u_domain); 911 } 912 } 913 914 if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain, 915 lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes, 916 mp->u_base, mp->u_base + mp->u_length, 917 mp->u_domain, mp->u_device_id) < 0) { 918 cmn_err(CE_WARN, 919 "!lgrp: failed to update memnode information for " 920 "memory (0x%" PRIx64 " - 0x%" PRIx64 ").", 921 mp->u_base, mp->u_base + mp->u_length); 922 } 923 break; 924 925 default: 926 break; 927 } 928 #endif /* __xpv */ 929 } 930 931 932 /* 933 * Return the platform handle for the lgroup containing the given CPU 934 */ 935 lgrp_handle_t 936 lgrp_plat_cpu_to_hand(processorid_t id) 937 { 938 lgrp_handle_t hand; 939 940 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 941 942 if (lgrp_plat_node_cnt == 1) 943 return (LGRP_DEFAULT_HANDLE); 944 945 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 946 lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries); 947 948 ASSERT(hand != (lgrp_handle_t)-1); 949 if (hand == (lgrp_handle_t)-1) 950 return (LGRP_NULL_HANDLE); 951 952 return (hand); 953 } 954 955 956 /* 957 * Platform-specific initialization of lgroups 958 */ 959 void 960 lgrp_plat_init(lgrp_init_stages_t stage) 961 { 962 #if defined(__xpv) 963 #else /* __xpv */ 964 u_longlong_t value; 965 #endif /* __xpv */ 966 967 switch (stage) { 968 case LGRP_INIT_STAGE1: 969 #if defined(__xpv) 970 /* 971 * XXPV For now, the hypervisor treats all memory equally.
972 */ 973 lgrp_plat_node_cnt = max_mem_nodes = 1; 974 #else /* __xpv */ 975 976 /* 977 * Get boot property for lgroup topology height limit 978 */ 979 if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0) 980 (void) lgrp_topo_ht_limit_set((int)value); 981 982 /* 983 * Get boot property for enabling/disabling SRAT 984 */ 985 if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0) 986 lgrp_plat_srat_enable = (int)value; 987 988 /* 989 * Get boot property for enabling/disabling SLIT 990 */ 991 if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0) 992 lgrp_plat_slit_enable = (int)value; 993 994 /* 995 * Get boot property for enabling/disabling MSCT 996 */ 997 if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0) 998 lgrp_plat_msct_enable = (int)value; 999 1000 /* 1001 * Initialize as a UMA machine 1002 */ 1003 if (lgrp_topo_ht_limit() == 1) { 1004 lgrp_plat_node_cnt = max_mem_nodes = 1; 1005 lgrp_plat_max_mem_node = 1; 1006 return; 1007 } 1008 1009 lgrp_plat_get_numa_config(); 1010 1011 /* 1012 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes 1013 * to support memory DR operations if memory DR is enabled. 1014 */ 1015 lgrp_plat_max_mem_node = lgrp_plat_node_cnt; 1016 if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) { 1017 max_mem_nodes = MAX_MEM_NODES_PER_LGROUP * 1018 lgrp_plat_node_cnt; 1019 ASSERT(max_mem_nodes <= MAX_MEM_NODES); 1020 } 1021 #endif /* __xpv */ 1022 break; 1023 1024 case LGRP_INIT_STAGE3: 1025 lgrp_plat_probe(); 1026 lgrp_plat_release_bootstrap(); 1027 break; 1028 1029 case LGRP_INIT_STAGE4: 1030 lgrp_plat_main_init(); 1031 break; 1032 1033 default: 1034 break; 1035 } 1036 } 1037 1038 1039 /* 1040 * Return latency between "from" and "to" lgroups 1041 * 1042 * This latency number can only be used for relative comparison 1043 * between lgroups on the running system, cannot be used across platforms, 1044 * and may not reflect the actual latency. It is platform and implementation 1045 * specific, so platform gets to decide its value. It would be nice if the 1046 * number was at least proportional to make comparisons more meaningful though. 1047 */ 1048 int 1049 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 1050 { 1051 lgrp_handle_t src, dest; 1052 int node; 1053 1054 if (max_mem_nodes == 1) 1055 return (0); 1056 1057 /* 1058 * Return max latency for root lgroup 1059 */ 1060 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 1061 return (lgrp_plat_lat_stats.latency_max); 1062 1063 src = from; 1064 dest = to; 1065 1066 /* 1067 * Return 0 for nodes (lgroup platform handles) out of range 1068 */ 1069 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 1070 return (0); 1071 1072 /* 1073 * Probe from current CPU if its lgroup latencies haven't been set yet 1074 * and we are trying to get latency from current CPU to some node. 1075 * Avoid probing if CPU/memory DR is enabled. 1076 */ 1077 if (lgrp_plat_lat_stats.latencies[src][src] == 0) { 1078 /* 1079 * Latency information should be updated by lgrp_plat_config() 1080 * for DR operations. Something is wrong if reaches here. 1081 * For safety, flatten lgrp topology to two levels. 
1082 */ 1083 if (plat_dr_support_cpu() || plat_dr_support_memory()) { 1084 ASSERT(lgrp_plat_lat_stats.latencies[src][src]); 1085 cmn_err(CE_WARN, 1086 "lgrp: failed to get latency information, " 1087 "fall back to two-level topology."); 1088 lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 1089 } else { 1090 node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 1091 lgrp_plat_cpu_node_nentries); 1092 ASSERT(node >= 0 && node < lgrp_plat_node_cnt); 1093 if (node == src) 1094 lgrp_plat_probe(); 1095 } 1096 } 1097 1098 return (lgrp_plat_lat_stats.latencies[src][dest]); 1099 } 1100 1101 1102 /* 1103 * Return the maximum number of lgrps supported by the platform. 1104 * Before lgrp topology is known it returns an estimate based on the number of 1105 * nodes. Once topology is known it returns: 1106 * 1) the actual maximum number of lgrps created if CPU/memory DR operations 1107 * are not supported. 1108 * 2) the maximum possible number of lgrps if CPU/memory DR operations are 1109 * supported. 1110 */ 1111 int 1112 lgrp_plat_max_lgrps(void) 1113 { 1114 if (!lgrp_topo_initialized || plat_dr_support_cpu() || 1115 plat_dr_support_memory()) { 1116 return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 1117 } else { 1118 return (lgrp_alloc_max + 1); 1119 } 1120 } 1121 1122 1123 /* 1124 * Count number of memory pages (_t) based on mnode id (_n) and query type (_q). 1125 */ 1126 #define _LGRP_PLAT_MEM_SIZE(_n, _q, _t) \ 1127 if (mem_node_config[_n].exists) { \ 1128 switch (_q) { \ 1129 case LGRP_MEM_SIZE_FREE: \ 1130 _t += MNODE_PGCNT(_n); \ 1131 break; \ 1132 case LGRP_MEM_SIZE_AVAIL: \ 1133 _t += mem_node_memlist_pages(_n, phys_avail); \ 1134 break; \ 1135 case LGRP_MEM_SIZE_INSTALL: \ 1136 _t += mem_node_memlist_pages(_n, phys_install); \ 1137 break; \ 1138 default: \ 1139 break; \ 1140 } \ 1141 } 1142 1143 /* 1144 * Return the number of free pages in an lgroup. 1145 * 1146 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 1147 * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 1148 * number of allocatable base pagesize pages corresponding to the 1149 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 1150 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 1151 * memory installed, regardless of whether or not it's usable. 1152 */ 1153 pgcnt_t 1154 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 1155 { 1156 int mnode; 1157 pgcnt_t npgs = (pgcnt_t)0; 1158 extern struct memlist *phys_avail; 1159 extern struct memlist *phys_install; 1160 1161 1162 if (plathand == LGRP_DEFAULT_HANDLE) 1163 return (lgrp_plat_mem_size_default(plathand, query)); 1164 1165 if (plathand != LGRP_NULL_HANDLE) { 1166 /* Count memory node present at boot. */ 1167 mnode = (int)plathand; 1168 ASSERT(mnode < lgrp_plat_node_cnt); 1169 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs); 1170 1171 /* Count possible hot-added memory nodes.
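 * Hot-added memory is assigned memnode IDs at or above lgrp_plat_node_cnt,
 * so walk those entries and count the ones whose lgroup handle matches
 * this lgroup.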
*/ 1172 for (mnode = lgrp_plat_node_cnt; 1173 mnode < lgrp_plat_max_mem_node; mnode++) { 1174 if (lgrp_plat_memnode_info[mnode].lgrphand == plathand) 1175 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs); 1176 } 1177 } 1178 1179 return (npgs); 1180 } 1181 1182 1183 /* 1184 * Return the platform handle of the lgroup that contains the physical memory 1185 * corresponding to the given page frame number 1186 */ 1187 lgrp_handle_t 1188 lgrp_plat_pfn_to_hand(pfn_t pfn) 1189 { 1190 int mnode; 1191 1192 if (max_mem_nodes == 1) 1193 return (LGRP_DEFAULT_HANDLE); 1194 1195 if (pfn > physmax) 1196 return (LGRP_NULL_HANDLE); 1197 1198 mnode = plat_pfn_to_mem_node(pfn); 1199 if (mnode < 0) 1200 return (LGRP_NULL_HANDLE); 1201 1202 return (MEM_NODE_2_LGRPHAND(mnode)); 1203 } 1204 1205 1206 /* 1207 * Probe memory in each node from current CPU to determine latency topology 1208 * 1209 * The probing code will probe the vendor ID register on the Northbridge of 1210 * Opteron processors and probe memory for other processors by default. 1211 * 1212 * Since probing is inherently error prone, the code takes laps across all the 1213 * nodes probing from each node to each of the other nodes some number of 1214 * times. Furthermore, each node is probed some number of times before moving 1215 * onto the next one during each lap. The minimum latency gotten between nodes 1216 * is kept as the latency between the nodes. 1217 * 1218 * After all that, the probe times are adjusted by normalizing values that are 1219 * close to each other and local latencies are made the same. Lastly, the 1220 * latencies are verified to make sure that certain conditions are met (eg. 1221 * local < remote, latency(a, b) == latency(b, a), etc.). 1222 * 1223 * If any of the conditions aren't met, the code will export a NUMA 1224 * configuration with the local CPUs and memory given by the SRAT or PCI config 1225 * space registers and one remote memory latency since it can't tell exactly 1226 * how far each node is from each other. 1227 */ 1228 void 1229 lgrp_plat_probe(void) 1230 { 1231 int from; 1232 int i; 1233 lgrp_plat_latency_stats_t *lat_stats; 1234 boolean_t probed; 1235 hrtime_t probe_time; 1236 int to; 1237 1238 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 1239 max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 1240 return; 1241 1242 /* SRAT and SLIT should be enabled if DR operations are enabled. */ 1243 if (plat_dr_support_cpu() || plat_dr_support_memory()) 1244 return; 1245 1246 /* 1247 * Determine ID of node containing current CPU 1248 */ 1249 from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 1250 lgrp_plat_cpu_node_nentries); 1251 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1252 if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 1253 ASSERT(lgrp_plat_node_domain[from].exists); 1254 1255 /* 1256 * Don't need to probe if got times already 1257 */ 1258 lat_stats = &lgrp_plat_lat_stats; 1259 if (lat_stats->latencies[from][from] != 0) 1260 return; 1261 1262 /* 1263 * Read vendor ID in Northbridge or read and write page(s) 1264 * in each node from current CPU and remember how long it takes, 1265 * so we can build latency topology of machine later. 1266 * This should approximate the memory latency between each node. 
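 * Only the lowest time seen for each (from, to) pair over all rounds is
 * kept, since probing is noisy and the minimum observed time is the best
 * estimate of the true latency.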
1267 */ 1268 probed = B_FALSE; 1269 for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 1270 for (to = 0; to < lgrp_plat_node_cnt; to++) { 1271 /* 1272 * Get probe time and skip over any nodes that can't be 1273 * probed yet or don't have memory 1274 */ 1275 probe_time = lgrp_plat_probe_time(to, 1276 lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries, 1277 &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats, 1278 &lgrp_plat_probe_stats); 1279 if (probe_time == 0) 1280 continue; 1281 1282 probed = B_TRUE; 1283 1284 /* 1285 * Keep lowest probe time as latency between nodes 1286 */ 1287 if (lat_stats->latencies[from][to] == 0 || 1288 probe_time < lat_stats->latencies[from][to]) 1289 lat_stats->latencies[from][to] = probe_time; 1290 1291 /* 1292 * Update overall minimum and maximum probe times 1293 * across all nodes 1294 */ 1295 if (probe_time < lat_stats->latency_min || 1296 lat_stats->latency_min == -1) 1297 lat_stats->latency_min = probe_time; 1298 if (probe_time > lat_stats->latency_max) 1299 lat_stats->latency_max = probe_time; 1300 } 1301 } 1302 1303 /* 1304 * Bail out if weren't able to probe any nodes from current CPU 1305 */ 1306 if (probed == B_FALSE) 1307 return; 1308 1309 /* 1310 * - Fix up latencies such that local latencies are same, 1311 * latency(i, j) == latency(j, i), etc. (if possible) 1312 * 1313 * - Verify that latencies look ok 1314 * 1315 * - Fallback to just optimizing for local and remote if 1316 * latencies didn't look right 1317 */ 1318 lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats, 1319 &lgrp_plat_probe_stats); 1320 lgrp_plat_probe_stats.probe_error_code = 1321 lgrp_plat_latency_verify(lgrp_plat_memnode_info, 1322 &lgrp_plat_lat_stats); 1323 if (lgrp_plat_probe_stats.probe_error_code) 1324 lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 1325 } 1326 1327 1328 /* 1329 * Return platform handle for root lgroup 1330 */ 1331 lgrp_handle_t 1332 lgrp_plat_root_hand(void) 1333 { 1334 return (LGRP_DEFAULT_HANDLE); 1335 } 1336 1337 1338 /* 1339 * INTERNAL ROUTINES 1340 */ 1341 1342 1343 /* 1344 * Update CPU to node mapping for given CPU and proximity domain. 1345 * Return values: 1346 * - zero for success 1347 * - positive numbers for warnings 1348 * - negative numbers for errors 1349 */ 1350 static int 1351 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt, 1352 cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain) 1353 { 1354 uint_t i; 1355 int node; 1356 1357 /* 1358 * Get node number for proximity domain 1359 */ 1360 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 1361 if (node == -1) { 1362 node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1363 domain); 1364 if (node == -1) 1365 return (-1); 1366 } 1367 1368 /* 1369 * Search for entry with given APIC ID and fill in its node and 1370 * proximity domain IDs (if they haven't been set already) 1371 */ 1372 for (i = 0; i < nentries; i++) { 1373 /* 1374 * Skip nonexistent entries and ones without matching APIC ID 1375 */ 1376 if (!cpu_node[i].exists || cpu_node[i].apicid != apicid) 1377 continue; 1378 1379 /* 1380 * Just return if entry completely and correctly filled in 1381 * already 1382 */ 1383 if (cpu_node[i].prox_domain == domain && 1384 cpu_node[i].node == node) 1385 return (1); 1386 1387 /* 1388 * It's invalid to have more than one entry with the same 1389 * local APIC ID in SRAT table. 
1390 */ 1391 if (cpu_node[i].node != UINT_MAX) 1392 return (-2); 1393 1394 /* 1395 * Fill in node and proximity domain IDs 1396 */ 1397 cpu_node[i].prox_domain = domain; 1398 cpu_node[i].node = node; 1399 1400 return (0); 1401 } 1402 1403 /* 1404 * It's possible that an apicid doesn't exist in the cpu_node map due 1405 * to user limits number of CPUs powered on at boot by specifying the 1406 * boot_ncpus kernel option. 1407 */ 1408 return (2); 1409 } 1410 1411 1412 /* 1413 * Get node ID for given CPU 1414 */ 1415 static int 1416 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, 1417 int cpu_node_nentries) 1418 { 1419 processorid_t cpuid; 1420 1421 if (cp == NULL) 1422 return (-1); 1423 1424 cpuid = cp->cpu_id; 1425 if (cpuid < 0 || cpuid >= max_ncpus) 1426 return (-1); 1427 1428 /* 1429 * SRAT doesn't exist, isn't enabled, or there was an error processing 1430 * it, so return node ID for Opteron and -1 otherwise. 1431 */ 1432 if (srat_ptr == NULL || !lgrp_plat_srat_enable || 1433 lgrp_plat_srat_error) { 1434 if (is_opteron()) 1435 return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE)); 1436 return (-1); 1437 } 1438 1439 /* 1440 * Return -1 when CPU to node ID mapping entry doesn't exist for given 1441 * CPU 1442 */ 1443 if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists) 1444 return (-1); 1445 1446 return (cpu_node[cpuid].node); 1447 } 1448 1449 1450 /* 1451 * Return node number for given proximity domain/system locality 1452 */ 1453 static int 1454 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt, 1455 uint32_t domain) 1456 { 1457 uint_t node; 1458 uint_t start; 1459 1460 /* 1461 * Hash proximity domain ID into node to domain mapping table (array), 1462 * search for entry with matching proximity domain ID, and return index 1463 * of matching entry as node ID. 1464 */ 1465 node = start = NODE_DOMAIN_HASH(domain, node_cnt); 1466 do { 1467 if (node_domain[node].exists) { 1468 membar_consumer(); 1469 if (node_domain[node].prox_domain == domain) 1470 return (node); 1471 } 1472 node = (node + 1) % node_cnt; 1473 } while (node != start); 1474 return (-1); 1475 } 1476 1477 1478 /* 1479 * Get NUMA configuration of machine 1480 */ 1481 static void 1482 lgrp_plat_get_numa_config(void) 1483 { 1484 uint_t probe_op; 1485 1486 /* 1487 * Read boot property with CPU to APIC ID mapping table/array to 1488 * determine number of CPUs 1489 */ 1490 lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL); 1491 1492 /* 1493 * Determine which CPUs and memory are local to each other and number 1494 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) 1495 */ 1496 if (lgrp_plat_apic_ncpus > 0) { 1497 int retval; 1498 1499 /* Reserve enough resources if CPU DR is enabled. 
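 * (size the table for max_ncpus rather than the CPUs enumerated at boot,
 * so CPUs hot-added later have slots in lgrp_plat_cpu_node[])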
*/ 1500 if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus) 1501 lgrp_plat_cpu_node_nentries = max_ncpus; 1502 else 1503 lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; 1504 1505 /* 1506 * Temporarily allocate boot memory to use for CPU to node 1507 * mapping since kernel memory allocator isn't alive yet 1508 */ 1509 lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops, 1510 NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t), 1511 sizeof (int)); 1512 1513 ASSERT(lgrp_plat_cpu_node != NULL); 1514 if (lgrp_plat_cpu_node) { 1515 bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries * 1516 sizeof (cpu_node_map_t)); 1517 } else { 1518 lgrp_plat_cpu_node_nentries = 0; 1519 } 1520 1521 /* 1522 * Fill in CPU to node ID mapping table with APIC ID for each 1523 * CPU 1524 */ 1525 (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); 1526 1527 retval = lgrp_plat_process_srat(srat_ptr, msct_ptr, 1528 &lgrp_plat_prox_domain_min, 1529 lgrp_plat_node_domain, lgrp_plat_cpu_node, 1530 lgrp_plat_apic_ncpus, lgrp_plat_memnode_info); 1531 if (retval <= 0) { 1532 lgrp_plat_srat_error = retval; 1533 lgrp_plat_node_cnt = 1; 1534 } else { 1535 lgrp_plat_srat_error = 0; 1536 lgrp_plat_node_cnt = retval; 1537 } 1538 } 1539 1540 /* 1541 * Try to use PCI config space registers on Opteron if there's an error 1542 * processing CPU to APIC ID mapping or SRAT 1543 */ 1544 if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && 1545 is_opteron()) 1546 opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, 1547 lgrp_plat_memnode_info); 1548 1549 /* 1550 * Don't bother to setup system for multiple lgroups and only use one 1551 * memory node when memory is interleaved between any nodes or there is 1552 * only one NUMA node 1553 */ 1554 if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { 1555 lgrp_plat_node_cnt = max_mem_nodes = 1; 1556 (void) lgrp_topo_ht_limit_set(1); 1557 return; 1558 } 1559 1560 /* 1561 * Leaf lgroups on x86/x64 architectures contain one physical 1562 * processor chip. Tune lgrp_expand_proc_thresh and 1563 * lgrp_expand_proc_diff so that lgrp_choose() will spread 1564 * things out aggressively. 1565 */ 1566 lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 1567 lgrp_expand_proc_diff = 0; 1568 1569 /* 1570 * There should be one memnode (physical page free list(s)) for 1571 * each node if memory DR is disabled. 1572 */ 1573 max_mem_nodes = lgrp_plat_node_cnt; 1574 1575 /* 1576 * Initialize min and max latency before reading SLIT or probing 1577 */ 1578 lgrp_plat_lat_stats.latency_min = -1; 1579 lgrp_plat_lat_stats.latency_max = 0; 1580 1581 /* 1582 * Determine how far each NUMA node is from each other by 1583 * reading ACPI System Locality Information Table (SLIT) if it 1584 * exists 1585 */ 1586 lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, 1587 lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info, 1588 &lgrp_plat_lat_stats); 1589 1590 /* 1591 * Disable support of CPU/memory DR operations if multiple locality 1592 * domains exist in system and either of following is true. 1593 * 1) Failed to process SLIT table. 1594 * 2) Latency probing is enabled by user. 
1595 */ 1596 if (lgrp_plat_node_cnt > 1 && 1597 (plat_dr_support_cpu() || plat_dr_support_memory())) { 1598 if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 || 1599 !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 || 1600 lgrp_plat_apic_ncpus <= 0) { 1601 cmn_err(CE_CONT, 1602 "?lgrp: failed to process ACPI SRAT/SLIT table, " 1603 "disable support of CPU/memory DR operations."); 1604 plat_dr_disable_cpu(); 1605 plat_dr_disable_memory(); 1606 } else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) { 1607 cmn_err(CE_CONT, 1608 "?lgrp: latency probing enabled by user, " 1609 "disable support of CPU/memory DR operations."); 1610 plat_dr_disable_cpu(); 1611 plat_dr_disable_memory(); 1612 } 1613 } 1614 1615 /* Done if the SLIT table was processed successfully. */ 1616 if (lgrp_plat_slit_error == 0) 1617 return; 1618 1619 /* 1620 * Probe to determine latency between NUMA nodes when SLIT 1621 * doesn't exist or make sense 1622 */ 1623 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; 1624 1625 /* 1626 * Specify whether to probe using vendor ID register or page copy 1627 * if it hasn't been specified already or is overspecified 1628 */ 1629 probe_op = lgrp_plat_probe_flags & 1630 (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1631 1632 if (probe_op == 0 || 1633 probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { 1634 lgrp_plat_probe_flags &= 1635 ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1636 if (is_opteron()) 1637 lgrp_plat_probe_flags |= 1638 LGRP_PLAT_PROBE_VENDOR; 1639 else 1640 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; 1641 } 1642 1643 /* 1644 * Probing errors can mess up the lgroup topology and 1645 * force us to fall back to a 2 level lgroup topology. 1646 * Here we bound how tall the lgroup topology can grow 1647 * in hopes of avoiding any anomalies in probing from 1648 * messing up the lgroup topology by limiting the 1649 * accuracy of the latency topology. 1650 * 1651 * Assume that nodes will at least be configured in a 1652 * ring, so limit height of lgroup topology to be less 1653 * than number of nodes on a system with 4 or more 1654 * nodes 1655 */ 1656 if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 1657 lgrp_topo_ht_limit_default()) 1658 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 1659 } 1660 1661 1662 /* 1663 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 1664 * be considered same 1665 */ 1666 #define LGRP_LAT_TOLERANCE_SHIFT 4 1667 1668 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 1669 1670 1671 /* 1672 * Adjust latencies between nodes to be symmetric, normalize latencies between 1673 * any nodes that are within some tolerance to be same, and make local 1674 * latencies be same 1675 */ 1676 static void 1677 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, 1678 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1679 { 1680 int i; 1681 int j; 1682 int k; 1683 int l; 1684 u_longlong_t max; 1685 u_longlong_t min; 1686 u_longlong_t t; 1687 u_longlong_t t1; 1688 u_longlong_t t2; 1689 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 1690 int lat_corrected[MAX_NODES][MAX_NODES]; 1691 1692 /* 1693 * Nothing to do when this is an UMA machine or don't have args needed 1694 */ 1695 if (max_mem_nodes == 1) 1696 return; 1697 1698 ASSERT(memnode_info != NULL && lat_stats != NULL && 1699 probe_stats != NULL); 1700 1701 /* 1702 * Make sure that latencies are symmetric between any two nodes 1703 * (ie.
latency(node0, node1) == latency(node1, node0)) 1704 */ 1705 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1706 if (!memnode_info[i].exists) 1707 continue; 1708 1709 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1710 if (!memnode_info[j].exists) 1711 continue; 1712 1713 t1 = lat_stats->latencies[i][j]; 1714 t2 = lat_stats->latencies[j][i]; 1715 1716 if (t1 == 0 || t2 == 0 || t1 == t2) 1717 continue; 1718 1719 /* 1720 * Latencies should be same 1721 * - Use minimum of two latencies which should be same 1722 * - Track suspect probe times not within tolerance of 1723 * min value 1724 * - Remember how much values are corrected by 1725 */ 1726 if (t1 > t2) { 1727 t = t2; 1728 probe_stats->probe_errors[i][j] += t1 - t2; 1729 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 1730 probe_stats->probe_suspect[i][j]++; 1731 probe_stats->probe_suspect[j][i]++; 1732 } 1733 } else if (t2 > t1) { 1734 t = t1; 1735 probe_stats->probe_errors[j][i] += t2 - t1; 1736 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 1737 probe_stats->probe_suspect[i][j]++; 1738 probe_stats->probe_suspect[j][i]++; 1739 } 1740 } 1741 1742 lat_stats->latencies[i][j] = 1743 lat_stats->latencies[j][i] = t; 1744 lgrp_config(cflag, t1, t); 1745 lgrp_config(cflag, t2, t); 1746 } 1747 } 1748 1749 /* 1750 * Keep track of which latencies get corrected 1751 */ 1752 for (i = 0; i < MAX_NODES; i++) 1753 for (j = 0; j < MAX_NODES; j++) 1754 lat_corrected[i][j] = 0; 1755 1756 /* 1757 * For every two nodes, see whether there is another pair of nodes which 1758 * are about the same distance apart and make the latencies be the same 1759 * if they are close enough together 1760 */ 1761 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1762 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1763 if (!memnode_info[j].exists) 1764 continue; 1765 /* 1766 * Pick one pair of nodes (i, j) 1767 * and get latency between them 1768 */ 1769 t1 = lat_stats->latencies[i][j]; 1770 1771 /* 1772 * Skip this pair of nodes if there isn't a latency 1773 * for it yet 1774 */ 1775 if (t1 == 0) 1776 continue; 1777 1778 for (k = 0; k < lgrp_plat_node_cnt; k++) { 1779 for (l = 0; l < lgrp_plat_node_cnt; l++) { 1780 if (!memnode_info[l].exists) 1781 continue; 1782 /* 1783 * Pick another pair of nodes (k, l) 1784 * not same as (i, j) and get latency 1785 * between them 1786 */ 1787 if (k == i && l == j) 1788 continue; 1789 1790 t2 = lat_stats->latencies[k][l]; 1791 1792 /* 1793 * Skip this pair of nodes if there 1794 * isn't a latency for it yet 1795 */ 1796 1797 if (t2 == 0) 1798 continue; 1799 1800 /* 1801 * Skip nodes (k, l) if they already 1802 * have same latency as (i, j) or 1803 * their latency isn't close enough to 1804 * be considered/made the same 1805 */ 1806 if (t1 == t2 || (t1 > t2 && t1 - t2 > 1807 t1 >> lgrp_plat_probe_lt_shift) || 1808 (t2 > t1 && t2 - t1 > 1809 t2 >> lgrp_plat_probe_lt_shift)) 1810 continue; 1811 1812 /* 1813 * Make latency(i, j) same as 1814 * latency(k, l), try to use latency 1815 * that has been adjusted already to get 1816 * more consistency (if possible), and 1817 * remember which latencies were 1818 * adjusted for next time 1819 */ 1820 if (lat_corrected[i][j]) { 1821 t = t1; 1822 lgrp_config(cflag, t2, t); 1823 t2 = t; 1824 } else if (lat_corrected[k][l]) { 1825 t = t2; 1826 lgrp_config(cflag, t1, t); 1827 t1 = t; 1828 } else { 1829 if (t1 > t2) 1830 t = t2; 1831 else 1832 t = t1; 1833 lgrp_config(cflag, t1, t); 1834 lgrp_config(cflag, t2, t); 1835 t1 = t2 = t; 1836 } 1837 1838 lat_stats->latencies[i][j] = 1839 lat_stats->latencies[k][l] = t; 1840 1841 
lat_corrected[i][j] = 1842 lat_corrected[k][l] = 1; 1843 } 1844 } 1845 } 1846 } 1847 1848 /* 1849 * Local latencies should be same 1850 * - Find min and max local latencies 1851 * - Make all local latencies be minimum 1852 */ 1853 min = -1; 1854 max = 0; 1855 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1856 if (!memnode_info[i].exists) 1857 continue; 1858 t = lat_stats->latencies[i][i]; 1859 if (t == 0) 1860 continue; 1861 if (min == -1 || t < min) 1862 min = t; 1863 if (t > max) 1864 max = t; 1865 } 1866 if (min != max) { 1867 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1868 int local; 1869 1870 if (!memnode_info[i].exists) 1871 continue; 1872 1873 local = lat_stats->latencies[i][i]; 1874 if (local == 0) 1875 continue; 1876 1877 /* 1878 * Track suspect probe times that aren't within 1879 * tolerance of minimum local latency and how much 1880 * probe times are corrected by 1881 */ 1882 if (local - min > min >> lgrp_plat_probe_lt_shift) 1883 probe_stats->probe_suspect[i][i]++; 1884 1885 probe_stats->probe_errors[i][i] += local - min; 1886 1887 /* 1888 * Make local latencies be minimum 1889 */ 1890 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 1891 lat_stats->latencies[i][i] = min; 1892 } 1893 } 1894 1895 /* 1896 * Determine max probe time again since just adjusted latencies 1897 */ 1898 lat_stats->latency_max = 0; 1899 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1900 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1901 if (!memnode_info[j].exists) 1902 continue; 1903 t = lat_stats->latencies[i][j]; 1904 if (t > lat_stats->latency_max) 1905 lat_stats->latency_max = t; 1906 } 1907 } 1908 } 1909 1910 1911 /* 1912 * Verify following about latencies between nodes: 1913 * 1914 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 1915 * - Local latencies same 1916 * - Local < remote 1917 * - Number of latencies seen is reasonable 1918 * - Number of occurrences of a given latency should be more than 1 1919 * 1920 * Returns: 1921 * 0 Success 1922 * -1 Not symmetric 1923 * -2 Local latencies not same 1924 * -3 Local >= remote 1925 */ 1926 static int 1927 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, 1928 lgrp_plat_latency_stats_t *lat_stats) 1929 { 1930 int i; 1931 int j; 1932 u_longlong_t t1; 1933 u_longlong_t t2; 1934 1935 ASSERT(memnode_info != NULL && lat_stats != NULL); 1936 1937 /* 1938 * Nothing to do when this is an UMA machine, lgroup topology is 1939 * limited to 2 levels, or there aren't any probe times yet 1940 */ 1941 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1942 lat_stats->latencies[0][0] == 0) 1943 return (0); 1944 1945 /* 1946 * Make sure that latencies are symmetric between any two nodes 1947 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1948 */ 1949 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1950 if (!memnode_info[i].exists) 1951 continue; 1952 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1953 if (!memnode_info[j].exists) 1954 continue; 1955 t1 = lat_stats->latencies[i][j]; 1956 t2 = lat_stats->latencies[j][i]; 1957 1958 if (t1 == 0 || t2 == 0 || t1 == t2) 1959 continue; 1960 1961 return (-1); 1962 } 1963 } 1964 1965 /* 1966 * Local latencies should be same 1967 */ 1968 t1 = lat_stats->latencies[0][0]; 1969 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1970 if (!memnode_info[i].exists) 1971 continue; 1972 1973 t2 = lat_stats->latencies[i][i]; 1974 if (t2 == 0) 1975 continue; 1976 1977 if (t1 == 0) { 1978 t1 = t2; 1979 continue; 1980 } 1981 1982 if (t1 != t2) 1983 return (-2); 1984 } 1985 1986 /* 1987 * Local latencies should be less than remote 1988 */ 1989 if (t1) { 1990 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1991 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1992 if (!memnode_info[j].exists) 1993 continue; 1994 t2 = lat_stats->latencies[i][j]; 1995 if (i == j || t2 == 0) 1996 continue; 1997 1998 if (t1 >= t2) 1999 return (-3); 2000 } 2001 } 2002 } 2003 2004 return (0); 2005 } 2006 2007 2008 /* 2009 * Platform-specific initialization 2010 */ 2011 static void 2012 lgrp_plat_main_init(void) 2013 { 2014 int curnode; 2015 int ht_limit; 2016 int i; 2017 2018 /* 2019 * Print a notice that MPO is disabled when memory is interleaved 2020 * across nodes....Would do this when it is discovered, but can't 2021 * because it happens way too early during boot.... 2022 */ 2023 if (lgrp_plat_mem_intrlv) 2024 cmn_err(CE_NOTE, 2025 "MPO disabled because memory is interleaved\n"); 2026 2027 /* 2028 * Don't bother to do any probing if it is disabled, there is only one 2029 * node, or the height of the lgroup topology less than or equal to 2 2030 */ 2031 ht_limit = lgrp_topo_ht_limit(); 2032 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 2033 max_mem_nodes == 1 || ht_limit <= 2) { 2034 /* 2035 * Setup lgroup latencies for 2 level lgroup topology 2036 * (ie. local and remote only) if they haven't been set yet 2037 */ 2038 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 2039 lgrp_plat_lat_stats.latency_max == 0) 2040 lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 2041 return; 2042 } 2043 2044 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 2045 /* 2046 * Should have been able to probe from CPU 0 when it was added 2047 * to lgroup hierarchy, but may not have been able to then 2048 * because it happens so early in boot that gethrtime() hasn't 2049 * been initialized. 
(:-( 2050 */ 2051 curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 2052 lgrp_plat_cpu_node_nentries); 2053 ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); 2054 if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) 2055 lgrp_plat_probe(); 2056 2057 return; 2058 } 2059 2060 /* 2061 * When probing memory, use one page for every sample to determine 2062 * lgroup topology and taking multiple samples 2063 */ 2064 if (lgrp_plat_probe_mem_config.probe_memsize == 0) 2065 lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * 2066 lgrp_plat_probe_nsamples; 2067 2068 /* 2069 * Map memory in each node needed for probing to determine latency 2070 * topology 2071 */ 2072 for (i = 0; i < lgrp_plat_node_cnt; i++) { 2073 int mnode; 2074 2075 /* 2076 * Skip this node and leave its probe page NULL 2077 * if it doesn't have any memory 2078 */ 2079 mnode = i; 2080 if (!mem_node_config[mnode].exists) { 2081 lgrp_plat_probe_mem_config.probe_va[i] = NULL; 2082 continue; 2083 } 2084 2085 /* 2086 * Allocate one kernel virtual page 2087 */ 2088 lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, 2089 lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); 2090 if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { 2091 cmn_err(CE_WARN, 2092 "lgrp_plat_main_init: couldn't allocate memory"); 2093 return; 2094 } 2095 2096 /* 2097 * Get PFN for first page in each node 2098 */ 2099 lgrp_plat_probe_mem_config.probe_pfn[i] = 2100 mem_node_config[mnode].physbase; 2101 2102 /* 2103 * Map virtual page to first page in node 2104 */ 2105 hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], 2106 lgrp_plat_probe_mem_config.probe_memsize, 2107 lgrp_plat_probe_mem_config.probe_pfn[i], 2108 PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 2109 HAT_LOAD_NOCONSIST); 2110 } 2111 2112 /* 2113 * Probe from current CPU 2114 */ 2115 lgrp_plat_probe(); 2116 } 2117 2118 2119 /* 2120 * Return the number of free, allocatable, or installed 2121 * pages in an lgroup 2122 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 2123 * used when MPO is disabled (i.e. 
single lgroup) or this is the root lgroup 2124 */ 2125 static pgcnt_t 2126 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 2127 { 2128 _NOTE(ARGUNUSED(lgrphand)); 2129 2130 struct memlist *mlist; 2131 pgcnt_t npgs = 0; 2132 extern struct memlist *phys_avail; 2133 extern struct memlist *phys_install; 2134 2135 switch (query) { 2136 case LGRP_MEM_SIZE_FREE: 2137 return ((pgcnt_t)freemem); 2138 case LGRP_MEM_SIZE_AVAIL: 2139 memlist_read_lock(); 2140 for (mlist = phys_avail; mlist; mlist = mlist->ml_next) 2141 npgs += btop(mlist->ml_size); 2142 memlist_read_unlock(); 2143 return (npgs); 2144 case LGRP_MEM_SIZE_INSTALL: 2145 memlist_read_lock(); 2146 for (mlist = phys_install; mlist; mlist = mlist->ml_next) 2147 npgs += btop(mlist->ml_size); 2148 memlist_read_unlock(); 2149 return (npgs); 2150 default: 2151 return ((pgcnt_t)0); 2152 } 2153 } 2154 2155 2156 /* 2157 * Update node to proximity domain mappings for given domain and return node ID 2158 */ 2159 static int 2160 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, 2161 uint32_t domain) 2162 { 2163 uint_t node; 2164 uint_t start; 2165 2166 /* 2167 * Hash proximity domain ID into node to domain mapping table (array) 2168 * and add entry for it into first non-existent or matching entry found 2169 */ 2170 node = start = NODE_DOMAIN_HASH(domain, node_cnt); 2171 do { 2172 /* 2173 * Entry doesn't exist yet, so create one for this proximity 2174 * domain and return node ID which is index into mapping table. 2175 */ 2176 if (!node_domain[node].exists) { 2177 node_domain[node].prox_domain = domain; 2178 membar_producer(); 2179 node_domain[node].exists = 1; 2180 return (node); 2181 } 2182 2183 /* 2184 * Entry exists for this proximity domain already, so just 2185 * return node ID (index into table). 2186 */ 2187 if (node_domain[node].prox_domain == domain) 2188 return (node); 2189 node = NODE_DOMAIN_HASH(node + 1, node_cnt); 2190 } while (node != start); 2191 2192 /* 2193 * Ran out of supported number of entries which shouldn't happen.... 2194 */ 2195 ASSERT(node != start); 2196 return (-1); 2197 } 2198 2199 /* 2200 * Update node memory information for given proximity domain with specified 2201 * starting and ending physical address range (and return positive numbers for 2202 * success and negative ones for errors) 2203 */ 2204 static int 2205 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt, 2206 memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start, 2207 uint64_t end, uint32_t domain, uint32_t device_id) 2208 { 2209 int node, mnode; 2210 2211 /* 2212 * Get node number for proximity domain 2213 */ 2214 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 2215 if (node == -1) { 2216 node = lgrp_plat_node_domain_update(node_domain, node_cnt, 2217 domain); 2218 if (node == -1) 2219 return (-1); 2220 } 2221 2222 /* 2223 * This function is called during boot if device_id is 2224 * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for 2225 * memory DR operations. 
2226 */ 2227 if (device_id != ACPI_MEMNODE_DEVID_BOOT) { 2228 ASSERT(lgrp_plat_max_mem_node <= memnode_cnt); 2229 2230 for (mnode = lgrp_plat_node_cnt; 2231 mnode < lgrp_plat_max_mem_node; mnode++) { 2232 if (memnode_info[mnode].exists && 2233 memnode_info[mnode].prox_domain == domain && 2234 memnode_info[mnode].device_id == device_id) { 2235 if (btop(start) < memnode_info[mnode].start) 2236 memnode_info[mnode].start = btop(start); 2237 if (btop(end) > memnode_info[mnode].end) 2238 memnode_info[mnode].end = btop(end); 2239 return (1); 2240 } 2241 } 2242 2243 if (lgrp_plat_max_mem_node >= memnode_cnt) { 2244 return (-3); 2245 } else { 2246 lgrp_plat_max_mem_node++; 2247 memnode_info[mnode].start = btop(start); 2248 memnode_info[mnode].end = btop(end); 2249 memnode_info[mnode].prox_domain = domain; 2250 memnode_info[mnode].device_id = device_id; 2251 memnode_info[mnode].lgrphand = node; 2252 membar_producer(); 2253 memnode_info[mnode].exists = 1; 2254 return (0); 2255 } 2256 } 2257 2258 /* 2259 * Create entry in table for node if it doesn't exist 2260 */ 2261 ASSERT(node < memnode_cnt); 2262 if (!memnode_info[node].exists) { 2263 memnode_info[node].start = btop(start); 2264 memnode_info[node].end = btop(end); 2265 memnode_info[node].prox_domain = domain; 2266 memnode_info[node].device_id = device_id; 2267 memnode_info[node].lgrphand = node; 2268 membar_producer(); 2269 memnode_info[node].exists = 1; 2270 return (0); 2271 } 2272 2273 /* 2274 * Entry already exists for this proximity domain 2275 * 2276 * There may be more than one SRAT memory entry for a domain, so we may 2277 * need to update existing start or end address for the node. 2278 */ 2279 if (memnode_info[node].prox_domain == domain) { 2280 if (btop(start) < memnode_info[node].start) 2281 memnode_info[node].start = btop(start); 2282 if (btop(end) > memnode_info[node].end) 2283 memnode_info[node].end = btop(end); 2284 return (1); 2285 } 2286 return (-2); 2287 } 2288 2289 2290 /* 2291 * Have to sort nodes by starting physical address because plat_mnode_xcheck() 2292 * assumes and expects memnodes to be sorted in ascending order by physical 2293 * address. 2294 */ 2295 static void 2296 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, 2297 cpu_node_map_t *cpu_node, int cpu_count, 2298 memnode_phys_addr_map_t *memnode_info) 2299 { 2300 boolean_t found; 2301 int i; 2302 int j; 2303 int n; 2304 boolean_t sorted; 2305 boolean_t swapped; 2306 2307 if (!lgrp_plat_node_sort_enable || node_cnt <= 1 || 2308 node_domain == NULL || memnode_info == NULL) 2309 return; 2310 2311 /* 2312 * Sorted already? 
2313 */ 2314 sorted = B_TRUE; 2315 for (i = 0; i < node_cnt - 1; i++) { 2316 /* 2317 * Skip entries that don't exist 2318 */ 2319 if (!memnode_info[i].exists) 2320 continue; 2321 2322 /* 2323 * Try to find next existing entry to compare against 2324 */ 2325 found = B_FALSE; 2326 for (j = i + 1; j < node_cnt; j++) { 2327 if (memnode_info[j].exists) { 2328 found = B_TRUE; 2329 break; 2330 } 2331 } 2332 2333 /* 2334 * Done if no more existing entries to compare against 2335 */ 2336 if (found == B_FALSE) 2337 break; 2338 2339 /* 2340 * Not sorted if starting address of current entry is bigger 2341 * than starting address of next existing entry 2342 */ 2343 if (memnode_info[i].start > memnode_info[j].start) { 2344 sorted = B_FALSE; 2345 break; 2346 } 2347 } 2348 2349 /* 2350 * Don't need to sort if sorted already 2351 */ 2352 if (sorted == B_TRUE) 2353 return; 2354 2355 /* 2356 * Just use bubble sort since number of nodes is small 2357 */ 2358 n = node_cnt; 2359 do { 2360 swapped = B_FALSE; 2361 n--; 2362 for (i = 0; i < n; i++) { 2363 /* 2364 * Skip entries that don't exist 2365 */ 2366 if (!memnode_info[i].exists) 2367 continue; 2368 2369 /* 2370 * Try to find next existing entry to compare against 2371 */ 2372 found = B_FALSE; 2373 for (j = i + 1; j <= n; j++) { 2374 if (memnode_info[j].exists) { 2375 found = B_TRUE; 2376 break; 2377 } 2378 } 2379 2380 /* 2381 * Done if no more existing entries to compare against 2382 */ 2383 if (found == B_FALSE) 2384 break; 2385 2386 if (memnode_info[i].start > memnode_info[j].start) { 2387 memnode_phys_addr_map_t save_addr; 2388 node_domain_map_t save_node; 2389 2390 /* 2391 * Swap node to proximity domain ID assignments 2392 */ 2393 bcopy(&node_domain[i], &save_node, 2394 sizeof (node_domain_map_t)); 2395 bcopy(&node_domain[j], &node_domain[i], 2396 sizeof (node_domain_map_t)); 2397 bcopy(&save_node, &node_domain[j], 2398 sizeof (node_domain_map_t)); 2399 2400 /* 2401 * Swap node to physical memory assignments 2402 */ 2403 bcopy(&memnode_info[i], &save_addr, 2404 sizeof (memnode_phys_addr_map_t)); 2405 bcopy(&memnode_info[j], &memnode_info[i], 2406 sizeof (memnode_phys_addr_map_t)); 2407 bcopy(&save_addr, &memnode_info[j], 2408 sizeof (memnode_phys_addr_map_t)); 2409 swapped = B_TRUE; 2410 } 2411 } 2412 } while (swapped == B_TRUE); 2413 2414 /* 2415 * Check to make sure that CPUs are assigned to correct node IDs now since 2416 * node to proximity domain ID assignments may have been changed above 2417 */ 2418 if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1) 2419 return; 2420 for (i = 0; i < cpu_count; i++) { 2421 int node; 2422 2423 node = lgrp_plat_domain_to_node(node_domain, node_cnt, 2424 cpu_node[i].prox_domain); 2425 if (cpu_node[i].node != node) 2426 cpu_node[i].node = node; 2427 } 2428 2429 } 2430 2431 2432 /* 2433 * Return time needed to probe from current CPU to memory in given node 2434 */ 2435 static hrtime_t 2436 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries, 2437 lgrp_plat_probe_mem_config_t *probe_mem_config, 2438 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 2439 { 2440 caddr_t buf; 2441 hrtime_t elapsed; 2442 hrtime_t end; 2443 int from; 2444 int i; 2445 int ipl; 2446 hrtime_t max; 2447 hrtime_t min; 2448 hrtime_t start; 2449 extern int use_sse_pagecopy; 2450 2451 /* 2452 * Determine ID of node containing current CPU 2453 */ 2454 from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries); 2455 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 2456 
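/* Probing is done in one of two ways: by timing reads of the vendor ID register in the destination node's Northbridge (LGRP_PLAT_PROBE_VENDOR, used on Opteron) or by timing page copies within memory belonging to the destination node (LGRP_PLAT_PROBE_PGCPY). Only the page copy method uses the probe pages mapped in lgrp_plat_main_init() and the cache invalidation done below. */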
2457 /* 2458 * Do common work for probing main memory 2459 */ 2460 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 2461 /* 2462 * Skip probing any nodes without memory and 2463 * set probe time to 0 2464 */ 2465 if (probe_mem_config->probe_va[to] == NULL) { 2466 lat_stats->latencies[from][to] = 0; 2467 return (0); 2468 } 2469 2470 /* 2471 * Invalidate caches once instead of once every sample 2472 * which should cut cost of probing by a lot 2473 */ 2474 probe_stats->flush_cost = gethrtime(); 2475 invalidate_cache(); 2476 probe_stats->flush_cost = gethrtime() - 2477 probe_stats->flush_cost; 2478 probe_stats->probe_cost_total += probe_stats->flush_cost; 2479 } 2480 2481 /* 2482 * Probe from current CPU to given memory using specified operation 2483 * and take specified number of samples 2484 */ 2485 max = 0; 2486 min = -1; 2487 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 2488 probe_stats->probe_cost = gethrtime(); 2489 2490 /* 2491 * Can't measure probe time if gethrtime() isn't working yet 2492 */ 2493 if (probe_stats->probe_cost == 0 && gethrtime() == 0) 2494 return (0); 2495 2496 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 2497 /* 2498 * Measure how long it takes to read vendor ID from 2499 * Northbridge 2500 */ 2501 elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads); 2502 } else { 2503 /* 2504 * Measure how long it takes to copy page 2505 * on top of itself 2506 */ 2507 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 2508 2509 kpreempt_disable(); 2510 ipl = splhigh(); 2511 start = gethrtime(); 2512 if (use_sse_pagecopy) 2513 hwblkpagecopy(buf, buf); 2514 else 2515 bcopy(buf, buf, PAGESIZE); 2516 end = gethrtime(); 2517 elapsed = end - start; 2518 splx(ipl); 2519 kpreempt_enable(); 2520 } 2521 2522 probe_stats->probe_cost = gethrtime() - 2523 probe_stats->probe_cost; 2524 probe_stats->probe_cost_total += probe_stats->probe_cost; 2525 2526 if (min == -1 || elapsed < min) 2527 min = elapsed; 2528 if (elapsed > max) 2529 max = elapsed; 2530 } 2531 2532 /* 2533 * Update minimum and maximum probe times between 2534 * these two nodes 2535 */ 2536 if (min < probe_stats->probe_min[from][to] || 2537 probe_stats->probe_min[from][to] == 0) 2538 probe_stats->probe_min[from][to] = min; 2539 2540 if (max > probe_stats->probe_max[from][to]) 2541 probe_stats->probe_max[from][to] = max; 2542 2543 return (min); 2544 } 2545 2546 2547 /* 2548 * Read boot property with CPU to APIC ID array, fill in CPU to node ID 2549 * mapping table with APIC ID for each CPU (if pointer to table isn't NULL), 2550 * and return number of CPU APIC IDs. 2551 * 2552 * NOTE: This code assumes that CPU IDs are assigned in order that they appear 2553 * in the cpu_apicid_array boot property which is based on and follows 2554 * same ordering as processor list in ACPI MADT. If the code in 2555 * usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns 2556 * CPU IDs ever changes, then this code will need to change too.... 2557 */ 2558 static int 2559 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) 2560 { 2561 int boot_prop_len; 2562 char *boot_prop_name = BP_CPU_APICID_ARRAY; 2563 uint8_t cpu_apicid_array[UINT8_MAX + 1]; 2564 int i; 2565 int n; 2566 2567 /* 2568 * Check length of property value 2569 */ 2570 boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name); 2571 if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array)) 2572 return (-1); 2573 2574 /* 2575 * Calculate number of entries in array and return when the system is 2576 * not very interesting for NUMA. 
It's not interesting for NUMA if 2577 * system has only one CPU and doesn't support CPU hotplug. 2578 */ 2579 n = boot_prop_len / sizeof (uint8_t); 2580 if (n == 1 && !plat_dr_support_cpu()) 2581 return (-2); 2582 2583 /* 2584 * Get CPU to APIC ID property value 2585 */ 2586 if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0) 2587 return (-3); 2588 2589 /* 2590 * Just return number of CPU APIC IDs if CPU to node mapping table is 2591 * NULL 2592 */ 2593 if (cpu_node == NULL) { 2594 if (plat_dr_support_cpu() && n >= boot_ncpus) { 2595 return (boot_ncpus); 2596 } else { 2597 return (n); 2598 } 2599 } 2600 2601 /* 2602 * Fill in CPU to node ID mapping table with APIC ID for each CPU 2603 */ 2604 for (i = 0; i < n; i++) { 2605 /* Only add boot CPUs into the map if CPU DR is enabled. */ 2606 if (plat_dr_support_cpu() && i >= boot_ncpus) 2607 break; 2608 cpu_node[i].exists = 1; 2609 cpu_node[i].apicid = cpu_apicid_array[i]; 2610 cpu_node[i].prox_domain = UINT32_MAX; 2611 cpu_node[i].node = UINT_MAX; 2612 } 2613 2614 /* 2615 * Return number of CPUs based on number of APIC IDs 2616 */ 2617 return (i); 2618 } 2619 2620 2621 /* 2622 * Read ACPI System Locality Information Table (SLIT) to determine how far each 2623 * NUMA node is from each other 2624 */ 2625 static int 2626 lgrp_plat_process_slit(struct slit *tp, 2627 node_domain_map_t *node_domain, uint_t node_cnt, 2628 memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats) 2629 { 2630 int i; 2631 int j; 2632 int src; 2633 int dst; 2634 int localities; 2635 hrtime_t max; 2636 hrtime_t min; 2637 int retval; 2638 uint8_t *slit_entries; 2639 2640 if (tp == NULL || !lgrp_plat_slit_enable) 2641 return (1); 2642 2643 if (lat_stats == NULL) 2644 return (2); 2645 2646 localities = tp->number; 2647 2648 min = lat_stats->latency_min; 2649 max = lat_stats->latency_max; 2650 2651 /* 2652 * Fill in latency matrix based on SLIT entries 2653 */ 2654 slit_entries = tp->entry; 2655 for (i = 0; i < localities; i++) { 2656 src = lgrp_plat_domain_to_node(node_domain, 2657 node_cnt, i); 2658 if (src == -1) 2659 continue; 2660 2661 for (j = 0; j < localities; j++) { 2662 uint8_t latency; 2663 2664 dst = lgrp_plat_domain_to_node(node_domain, 2665 node_cnt, j); 2666 if (dst == -1) 2667 continue; 2668 2669 latency = slit_entries[(i * localities) + j]; 2670 lat_stats->latencies[src][dst] = latency; 2671 if (latency < min || min == -1) 2672 min = latency; 2673 if (latency > max) 2674 max = latency; 2675 } 2676 } 2677 2678 /* 2679 * Verify that latencies/distances given in SLIT look reasonable 2680 */ 2681 retval = lgrp_plat_latency_verify(memnode_info, lat_stats); 2682 2683 if (retval) { 2684 /* 2685 * Reinitialize (zero) latency table since SLIT doesn't look 2686 * right 2687 */ 2688 for (i = 0; i < localities; i++) { 2689 for (j = 0; j < localities; j++) 2690 lat_stats->latencies[i][j] = 0; 2691 } 2692 } else { 2693 /* 2694 * Update min and max latencies seen since SLIT looks valid 2695 */ 2696 lat_stats->latency_min = min; 2697 lat_stats->latency_max = max; 2698 } 2699 2700 return (retval); 2701 } 2702 2703 2704 /* 2705 * Update lgrp latencies according to information returned by ACPI _SLI method. 
2706 */ 2707 static int 2708 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info, 2709 uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt, 2710 lgrp_plat_latency_stats_t *lat_stats) 2711 { 2712 int i; 2713 int src, dst; 2714 uint8_t latency; 2715 hrtime_t max, min; 2716 2717 if (lat_stats == NULL || sli_info == NULL || 2718 sli_cnt == 0 || domain_id >= sli_cnt) 2719 return (-1); 2720 2721 src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id); 2722 if (src == -1) { 2723 src = lgrp_plat_node_domain_update(node_domain, node_cnt, 2724 domain_id); 2725 if (src == -1) 2726 return (-1); 2727 } 2728 2729 /* 2730 * Don't update latency info if topology has been flattened to 2 levels. 2731 */ 2732 if (lgrp_plat_topo_flatten != 0) { 2733 return (0); 2734 } 2735 2736 /* 2737 * Latency information for proximity domain is ready. 2738 * TODO: support adjusting latency information at runtime. 2739 */ 2740 if (lat_stats->latencies[src][src] != 0) { 2741 return (0); 2742 } 2743 2744 /* Validate latency information. */ 2745 for (i = 0; i < sli_cnt; i++) { 2746 if (i == domain_id) { 2747 if (sli_info[i] != ACPI_SLIT_SELF_LATENCY || 2748 sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) { 2749 return (-1); 2750 } 2751 } else { 2752 if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY || 2753 sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY || 2754 sli_info[i] != sli_info[sli_cnt + i]) { 2755 return (-1); 2756 } 2757 } 2758 } 2759 2760 min = lat_stats->latency_min; 2761 max = lat_stats->latency_max; 2762 for (i = 0; i < sli_cnt; i++) { 2763 dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i); 2764 if (dst == -1) 2765 continue; 2766 2767 ASSERT(sli_info[i] == sli_info[sli_cnt + i]); 2768 2769 /* Update row in latencies matrix. */ 2770 latency = sli_info[i]; 2771 lat_stats->latencies[src][dst] = latency; 2772 if (latency < min || min == -1) 2773 min = latency; 2774 if (latency > max) 2775 max = latency; 2776 2777 /* Update column in latencies matrix. */ 2778 latency = sli_info[sli_cnt + i]; 2779 lat_stats->latencies[dst][src] = latency; 2780 if (latency < min || min == -1) 2781 min = latency; 2782 if (latency > max) 2783 max = latency; 2784 } 2785 lat_stats->latency_min = min; 2786 lat_stats->latency_max = max; 2787 2788 return (0); 2789 } 2790 2791 2792 /* 2793 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 2794 * and memory are local to each other in the same NUMA node and return number 2795 * of nodes 2796 */ 2797 static int 2798 lgrp_plat_process_srat(struct srat *tp, struct msct *mp, 2799 uint32_t *prox_domain_min, node_domain_map_t *node_domain, 2800 cpu_node_map_t *cpu_node, int cpu_count, 2801 memnode_phys_addr_map_t *memnode_info) 2802 { 2803 struct srat_item *srat_end; 2804 int i; 2805 struct srat_item *item; 2806 int node_cnt; 2807 int proc_entry_count; 2808 int rc; 2809 2810 /* 2811 * Nothing to do when no SRAT or disabled 2812 */ 2813 if (tp == NULL || !lgrp_plat_srat_enable) 2814 return (-1); 2815 2816 /* 2817 * Try to get domain information from MSCT table. 2818 * ACPI4.0: OSPM will use information provided by the MSCT only 2819 * when the System Resource Affinity Table (SRAT) exists. 2820 */ 2821 node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min); 2822 if (node_cnt <= 0) { 2823 /* 2824 * Determine number of nodes by counting number of proximity 2825 * domains in SRAT. 2826 */ 2827 node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min); 2828 } 2829 /* 2830 * Return if number of nodes is 1 or less since don't need to read SRAT. 
2831 */ 2832 if (node_cnt == 1) 2833 return (1); 2834 else if (node_cnt <= 0) 2835 return (-2); 2836 2837 /* 2838 * Walk through SRAT, examining each CPU and memory entry to determine 2839 * which CPUs and memory belong to which node. 2840 */ 2841 item = tp->list; 2842 srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2843 proc_entry_count = 0; 2844 while (item < srat_end) { 2845 uint32_t apic_id; 2846 uint32_t domain; 2847 uint64_t end; 2848 uint64_t length; 2849 uint64_t start; 2850 2851 switch (item->type) { 2852 case SRAT_PROCESSOR: /* CPU entry */ 2853 if (!(item->i.p.flags & SRAT_ENABLED) || 2854 cpu_node == NULL) 2855 break; 2856 2857 /* 2858 * Calculate domain (node) ID and fill in APIC ID to 2859 * domain/node mapping table 2860 */ 2861 domain = item->i.p.domain1; 2862 for (i = 0; i < 3; i++) { 2863 domain += item->i.p.domain2[i] << 2864 ((i + 1) * 8); 2865 } 2866 apic_id = item->i.p.apic_id; 2867 2868 rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, 2869 cpu_node, cpu_count, apic_id, domain); 2870 if (rc < 0) 2871 return (-3); 2872 else if (rc == 0) 2873 proc_entry_count++; 2874 break; 2875 2876 case SRAT_MEMORY: /* memory entry */ 2877 if (!(item->i.m.flags & SRAT_ENABLED) || 2878 memnode_info == NULL) 2879 break; 2880 2881 /* 2882 * Get domain (node) ID and fill in domain/node 2883 * to memory mapping table 2884 */ 2885 domain = item->i.m.domain; 2886 start = item->i.m.base_addr; 2887 length = item->i.m.len; 2888 end = start + length - 1; 2889 2890 /* 2891 * According to ACPI 4.0, both ENABLE and HOTPLUG flags 2892 * may be set for memory address range entries in SRAT 2893 * table which are reserved for memory hot plug. 2894 * We intersect memory address ranges in SRAT table 2895 * with memory ranges in physinstalled to filter out 2896 * memory address ranges reserved for hot plug. 2897 */ 2898 if (item->i.m.flags & SRAT_HOT_PLUG) { 2899 uint64_t rstart = UINT64_MAX; 2900 uint64_t rend = 0; 2901 struct memlist *ml; 2902 extern struct bootops *bootops; 2903 2904 memlist_read_lock(); 2905 for (ml = bootops->boot_mem->physinstalled; 2906 ml; ml = ml->ml_next) { 2907 uint64_t tstart = ml->ml_address; 2908 uint64_t tend; 2909 2910 tend = ml->ml_address + ml->ml_size; 2911 if (tstart > end || tend < start) 2912 continue; 2913 if (start > tstart) 2914 tstart = start; 2915 if (rstart > tstart) 2916 rstart = tstart; 2917 if (end < tend) 2918 tend = end; 2919 if (rend < tend) 2920 rend = tend; 2921 } 2922 memlist_read_unlock(); 2923 start = rstart; 2924 end = rend; 2925 /* Skip this entry if no memory installed. 
*/ 2926 if (start > end) 2927 break; 2928 } 2929 2930 if (lgrp_plat_memnode_info_update(node_domain, 2931 node_cnt, memnode_info, node_cnt, 2932 start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0) 2933 return (-4); 2934 break; 2935 2936 case SRAT_X2APIC: /* x2apic CPU entry */ 2937 if (!(item->i.xp.flags & SRAT_ENABLED) || 2938 cpu_node == NULL) 2939 break; 2940 2941 /* 2942 * Calculate domain (node) ID and fill in APIC ID to 2943 * domain/node mapping table 2944 */ 2945 domain = item->i.xp.domain; 2946 apic_id = item->i.xp.x2apic_id; 2947 2948 rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, 2949 cpu_node, cpu_count, apic_id, domain); 2950 if (rc < 0) 2951 return (-3); 2952 else if (rc == 0) 2953 proc_entry_count++; 2954 break; 2955 2956 default: 2957 break; 2958 } 2959 2960 item = (struct srat_item *)((uintptr_t)item + item->len); 2961 } 2962 2963 /* 2964 * Should have seen at least as many SRAT processor entries as CPUs 2965 */ 2966 if (proc_entry_count < cpu_count) 2967 return (-5); 2968 2969 /* 2970 * Need to sort nodes by starting physical address since VM system 2971 * assumes and expects memnodes to be sorted in ascending order by 2972 * physical address 2973 */ 2974 lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count, 2975 memnode_info); 2976 2977 return (node_cnt); 2978 } 2979 2980 2981 /* 2982 * Allocate permanent memory for any temporary memory that we needed to 2983 * allocate using BOP_ALLOC() before kmem_alloc() and VM system were 2984 * initialized and copy everything from temporary to permanent memory since 2985 * temporary boot memory will eventually be released during boot 2986 */ 2987 static void 2988 lgrp_plat_release_bootstrap(void) 2989 { 2990 void *buf; 2991 size_t size; 2992 2993 if (lgrp_plat_cpu_node_nentries > 0) { 2994 size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t); 2995 buf = kmem_alloc(size, KM_SLEEP); 2996 bcopy(lgrp_plat_cpu_node, buf, size); 2997 lgrp_plat_cpu_node = buf; 2998 } 2999 } 3000 3001 3002 /* 3003 * Return number of proximity domains given in ACPI SRAT 3004 */ 3005 static int 3006 lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min) 3007 { 3008 int domain_cnt; 3009 uint32_t domain_min; 3010 struct srat_item *end; 3011 int i; 3012 struct srat_item *item; 3013 node_domain_map_t node_domain[MAX_NODES]; 3014 3015 3016 if (tp == NULL || !lgrp_plat_srat_enable) 3017 return (1); 3018 3019 /* 3020 * Walk through SRAT to find minimum proximity domain ID 3021 */ 3022 domain_min = UINT32_MAX; 3023 item = tp->list; 3024 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 3025 while (item < end) { 3026 uint32_t domain; 3027 3028 switch (item->type) { 3029 case SRAT_PROCESSOR: /* CPU entry */ 3030 if (!(item->i.p.flags & SRAT_ENABLED)) { 3031 item = (struct srat_item *)((uintptr_t)item + 3032 item->len); 3033 continue; 3034 } 3035 domain = item->i.p.domain1; 3036 for (i = 0; i < 3; i++) { 3037 domain += item->i.p.domain2[i] << 3038 ((i + 1) * 8); 3039 } 3040 break; 3041 3042 case SRAT_MEMORY: /* memory entry */ 3043 if (!(item->i.m.flags & SRAT_ENABLED)) { 3044 item = (struct srat_item *)((uintptr_t)item + 3045 item->len); 3046 continue; 3047 } 3048 domain = item->i.m.domain; 3049 break; 3050 3051 case SRAT_X2APIC: /* x2apic CPU entry */ 3052 if (!(item->i.xp.flags & SRAT_ENABLED)) { 3053 item = (struct srat_item *)((uintptr_t)item + 3054 item->len); 3055 continue; 3056 } 3057 domain = item->i.xp.domain; 3058 break; 3059 3060 default: 3061 item = (struct srat_item *)((uintptr_t)item + 3062 item->len); 3063 
continue; 3064 } 3065 3066 /* 3067 * Keep track of minimum proximity domain ID 3068 */ 3069 if (domain < domain_min) 3070 domain_min = domain; 3071 3072 item = (struct srat_item *)((uintptr_t)item + item->len); 3073 } 3074 if (lgrp_plat_domain_min_enable && prox_domain_min != NULL) 3075 *prox_domain_min = domain_min; 3076 3077 /* 3078 * Walk through SRAT, examining each CPU and memory entry to determine 3079 * proximity domain ID for each. 3080 */ 3081 domain_cnt = 0; 3082 item = tp->list; 3083 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 3084 bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t)); 3085 while (item < end) { 3086 uint32_t domain; 3087 boolean_t overflow; 3088 uint_t start; 3089 3090 switch (item->type) { 3091 case SRAT_PROCESSOR: /* CPU entry */ 3092 if (!(item->i.p.flags & SRAT_ENABLED)) { 3093 item = (struct srat_item *)((uintptr_t)item + 3094 item->len); 3095 continue; 3096 } 3097 domain = item->i.p.domain1; 3098 for (i = 0; i < 3; i++) { 3099 domain += item->i.p.domain2[i] << 3100 ((i + 1) * 8); 3101 } 3102 break; 3103 3104 case SRAT_MEMORY: /* memory entry */ 3105 if (!(item->i.m.flags & SRAT_ENABLED)) { 3106 item = (struct srat_item *)((uintptr_t)item + 3107 item->len); 3108 continue; 3109 } 3110 domain = item->i.m.domain; 3111 break; 3112 3113 case SRAT_X2APIC: /* x2apic CPU entry */ 3114 if (!(item->i.xp.flags & SRAT_ENABLED)) { 3115 item = (struct srat_item *)((uintptr_t)item + 3116 item->len); 3117 continue; 3118 } 3119 domain = item->i.xp.domain; 3120 break; 3121 3122 default: 3123 item = (struct srat_item *)((uintptr_t)item + 3124 item->len); 3125 continue; 3126 } 3127 3128 /* 3129 * Count and keep track of which proximity domain IDs seen 3130 */ 3131 start = i = domain % MAX_NODES; 3132 overflow = B_TRUE; 3133 do { 3134 /* 3135 * Create entry for proximity domain and increment 3136 * count when no entry exists where proximity domain 3137 * hashed 3138 */ 3139 if (!node_domain[i].exists) { 3140 node_domain[i].exists = 1; 3141 node_domain[i].prox_domain = domain; 3142 domain_cnt++; 3143 overflow = B_FALSE; 3144 break; 3145 } 3146 3147 /* 3148 * Nothing to do when proximity domain seen already 3149 * and its entry exists 3150 */ 3151 if (node_domain[i].prox_domain == domain) { 3152 overflow = B_FALSE; 3153 break; 3154 } 3155 3156 /* 3157 * Entry exists where proximity domain hashed, but for 3158 * different proximity domain so keep search for empty 3159 * slot to put it or matching entry whichever comes 3160 * first. 3161 */ 3162 i = (i + 1) % MAX_NODES; 3163 } while (i != start); 3164 3165 /* 3166 * Didn't find empty or matching entry which means have more 3167 * proximity domains than supported nodes (:-( 3168 */ 3169 ASSERT(overflow != B_TRUE); 3170 if (overflow == B_TRUE) 3171 return (-1); 3172 3173 item = (struct srat_item *)((uintptr_t)item + item->len); 3174 } 3175 return (domain_cnt); 3176 } 3177 3178 3179 /* 3180 * Parse domain information in ACPI Maximum System Capability Table (MSCT). 3181 * MSCT table has been verified in function process_msct() in fakebop.c. 
3182 */ 3183 static int 3184 lgrp_plat_msct_domains(struct msct *tp, uint32_t *prox_domain_min) 3185 { 3186 int last_seen = 0; 3187 uint32_t proxmin = UINT32_MAX; 3188 struct msct_proximity_domain *item, *end; 3189 3190 if (tp == NULL || lgrp_plat_msct_enable == 0) 3191 return (-1); 3192 3193 if (tp->maximum_proximity_domains >= MAX_NODES) { 3194 cmn_err(CE_CONT, 3195 "?lgrp: too many proximity domains (%d), max %d supported, " 3196 "disable support of CPU/memory DR operations.", 3197 tp->maximum_proximity_domains + 1, MAX_NODES); 3198 plat_dr_disable_cpu(); 3199 plat_dr_disable_memory(); 3200 return (-1); 3201 } 3202 3203 if (prox_domain_min != NULL) { 3204 end = (void *)(tp->hdr.len + (uintptr_t)tp); 3205 for (item = (void *)((uintptr_t)tp + 3206 tp->proximity_domain_offset); item < end; 3207 item = (void *)(item->length + (uintptr_t)item)) { 3208 if (item->domain_min < proxmin) { 3209 proxmin = item->domain_min; 3210 } 3211 3212 last_seen = item->domain_max - item->domain_min + 1; 3213 /* 3214 * Break out if all proximity domains have been 3215 * processed. Some BIOSes may have unused items 3216 * at the end of MSCT table. 3217 */ 3218 if (last_seen > tp->maximum_proximity_domains) { 3219 break; 3220 } 3221 } 3222 *prox_domain_min = proxmin; 3223 } 3224 3225 return (tp->maximum_proximity_domains + 1); 3226 } 3227 3228 3229 /* 3230 * Set lgroup latencies for 2 level lgroup topology 3231 */ 3232 static void 3233 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats) 3234 { 3235 int i, j; 3236 3237 ASSERT(lat_stats != NULL); 3238 3239 if (lgrp_plat_node_cnt >= 4) 3240 cmn_err(CE_NOTE, 3241 "MPO only optimizing for local and remote\n"); 3242 for (i = 0; i < lgrp_plat_node_cnt; i++) { 3243 for (j = 0; j < lgrp_plat_node_cnt; j++) { 3244 if (i == j) 3245 lat_stats->latencies[i][j] = 2; 3246 else 3247 lat_stats->latencies[i][j] = 3; 3248 } 3249 } 3250 lat_stats->latency_min = 2; 3251 lat_stats->latency_max = 3; 3252 /* TODO: check it. */ 3253 lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 3254 lgrp_plat_topo_flatten = 1; 3255 } 3256 3257 3258 /* 3259 * The following Opteron specific constants, macros, types, and routines define 3260 * PCI configuration space registers and how to read them to determine the NUMA 3261 * configuration of *supported* Opteron processors. They provide the same 3262 * information that may be gotten from the ACPI System Resource Affinity Table 3263 * (SRAT) if it exists on the machine of interest. 3264 * 3265 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family 3266 * of interest describes all of these registers and their contents. The main 3267 * registers used by this code to determine the NUMA configuration of the 3268 * machine are the node ID register for the number of NUMA nodes and the DRAM 3269 * address map registers for the physical address range of each node. 3270 * 3271 * NOTE: The format and how to determine the NUMA configuration using PCI 3272 * config space registers may change or may not be supported in future 3273 * Opteron processor families. 
3274 */ 3275 3276 /* 3277 * How many bits to shift Opteron DRAM Address Map base and limit registers 3278 * to get actual value 3279 */ 3280 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 3281 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 3282 3283 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 3284 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 3285 3286 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 3287 3288 /* 3289 * Macros to derive addresses from Opteron DRAM Address Map registers 3290 */ 3291 #define OPT_DRAMADDR_HI(reg) \ 3292 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 3293 OPT_DRAMADDR_HI_LSHIFT_ADDR) 3294 3295 #define OPT_DRAMADDR_LO(reg) \ 3296 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 3297 OPT_DRAMADDR_LO_LSHIFT_ADDR) 3298 3299 #define OPT_DRAMADDR(high, low) \ 3300 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 3301 3302 /* 3303 * Bit masks defining what's in Opteron DRAM Address Map base register 3304 */ 3305 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 3306 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 3307 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 3308 3309 /* 3310 * Bit masks defining what's in Opteron DRAM Address Map limit register 3311 */ 3312 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 3313 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 3314 3315 3316 /* 3317 * Opteron Node ID register in PCI configuration space contains 3318 * number of nodes in system, etc. for Opteron K8. The following 3319 * constants and macros define its contents, structure, and access. 3320 */ 3321 3322 /* 3323 * Bit masks defining what's in Opteron Node ID register 3324 */ 3325 #define OPT_NODE_MASK_ID 0x7 /* node ID */ 3326 #define OPT_NODE_MASK_CNT 0x70 /* node count */ 3327 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 3328 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 3329 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 3330 3331 /* 3332 * How many bits in Opteron Node ID register to shift right to get actual value 3333 */ 3334 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 3335 3336 /* 3337 * Macros to get values from Opteron Node ID register 3338 */ 3339 #define OPT_NODE_CNT(reg) \ 3340 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 3341 3342 /* 3343 * Macro to setup PCI Extended Configuration Space (ECS) address to give to 3344 * "in/out" instructions 3345 * 3346 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 3347 * other uses should just do MMIO to access PCI ECS. 3348 * Must enable special bit in Northbridge Configuration Register on 3349 * Greyhound for extended CF8 space access to be able to access PCI ECS 3350 * using "in/out" instructions and restore special bit after done 3351 * accessing PCI ECS. 3352 */ 3353 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 3354 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 3355 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 3356 ((((reg) >> 8) & 0xf) << 24)) 3357 3358 /* 3359 * PCI configuration space registers accessed by specifying 3360 * a bus, device, function, and offset. 
The following constants 3361 * define the values needed to access Opteron K8 configuration 3362 * info to determine its node topology 3363 */ 3364 3365 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 3366 3367 /* 3368 * Opteron PCI configuration space register function values 3369 */ 3370 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 3371 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 3372 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 3373 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 3374 3375 /* 3376 * PCI Configuration Space register offsets 3377 */ 3378 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 3379 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 3380 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 3381 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 3382 3383 /* 3384 * Opteron PCI Configuration Space device IDs for nodes 3385 */ 3386 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 3387 3388 3389 /* 3390 * Opteron DRAM address map gives base and limit for physical memory in a node 3391 */ 3392 typedef struct opt_dram_addr_map { 3393 uint32_t base_hi; 3394 uint32_t base_lo; 3395 uint32_t limit_hi; 3396 uint32_t limit_lo; 3397 } opt_dram_addr_map_t; 3398 3399 3400 /* 3401 * Supported AMD processor families 3402 */ 3403 #define AMD_FAMILY_HAMMER 15 3404 #define AMD_FAMILY_GREYHOUND 16 3405 3406 /* 3407 * Whether to have is_opteron() return 1 even when processor isn't supported 3408 */ 3409 uint_t is_opteron_override = 0; 3410 3411 /* 3412 * AMD processor family for current CPU 3413 */ 3414 uint_t opt_family = 0; 3415 3416 3417 /* 3418 * Determine whether we're running on a supported AMD Opteron since reading 3419 * node count and DRAM address map registers may have different format or 3420 * may not be supported across processor families 3421 */ 3422 static int 3423 is_opteron(void) 3424 { 3425 3426 if (x86_vendor != X86_VENDOR_AMD) 3427 return (0); 3428 3429 opt_family = cpuid_getfamily(CPU); 3430 if (opt_family == AMD_FAMILY_HAMMER || 3431 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 3432 return (1); 3433 else 3434 return (0); 3435 } 3436 3437 3438 /* 3439 * Determine NUMA configuration for Opteron from registers that live in PCI 3440 * configuration space 3441 */ 3442 static void 3443 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 3444 memnode_phys_addr_map_t *memnode_info) 3445 { 3446 uint_t bus; 3447 uint_t dev; 3448 struct opt_dram_addr_map dram_map[MAX_NODES]; 3449 uint_t node; 3450 uint_t node_info[MAX_NODES]; 3451 uint_t off_hi; 3452 uint_t off_lo; 3453 uint64_t nb_cfg_reg; 3454 3455 /* 3456 * Read configuration registers from PCI configuration space to 3457 * determine node information, which memory is in each node, etc. 
3458 * 3459 * Write to PCI configuration space address register to specify 3460 * which configuration register to read and read/write PCI 3461 * configuration space data register to get/set contents 3462 */ 3463 bus = OPT_PCS_BUS_CONFIG; 3464 dev = OPT_PCS_DEV_NODE0; 3465 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 3466 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 3467 3468 /* 3469 * Read node ID register for node 0 to get node count 3470 */ 3471 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 3472 OPT_PCS_OFF_NODEID); 3473 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 3474 3475 /* 3476 * If number of nodes is more than maximum supported, then set node 3477 * count to 1 and treat system as UMA instead of NUMA. 3478 */ 3479 if (*node_cnt > MAX_NODES) { 3480 *node_cnt = 1; 3481 return; 3482 } 3483 3484 /* 3485 * For Greyhound, PCI Extended Configuration Space must be enabled to 3486 * read high DRAM address map base and limit registers 3487 */ 3488 if (opt_family == AMD_FAMILY_GREYHOUND) { 3489 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 3490 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 3491 wrmsr(MSR_AMD_NB_CFG, 3492 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 3493 } 3494 3495 for (node = 0; node < *node_cnt; node++) { 3496 uint32_t base_hi; 3497 uint32_t base_lo; 3498 uint32_t limit_hi; 3499 uint32_t limit_lo; 3500 3501 /* 3502 * Read node ID register (except for node 0 which we just read) 3503 */ 3504 if (node > 0) { 3505 node_info[node] = pci_getl_func(bus, dev, 3506 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 3507 } 3508 3509 /* 3510 * Read DRAM base and limit registers which specify 3511 * physical memory range of each node 3512 */ 3513 if (opt_family != AMD_FAMILY_GREYHOUND) 3514 base_hi = 0; 3515 else { 3516 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 3517 OPT_PCS_FUNC_ADDRMAP, off_hi)); 3518 base_hi = dram_map[node].base_hi = 3519 inl(PCI_CONFDATA); 3520 } 3521 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 3522 OPT_PCS_FUNC_ADDRMAP, off_lo); 3523 3524 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 3525 mem_intrlv) 3526 *mem_intrlv = *mem_intrlv + 1; 3527 3528 off_hi += 4; /* high limit register offset */ 3529 if (opt_family != AMD_FAMILY_GREYHOUND) 3530 limit_hi = 0; 3531 else { 3532 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 3533 OPT_PCS_FUNC_ADDRMAP, off_hi)); 3534 limit_hi = dram_map[node].limit_hi = 3535 inl(PCI_CONFDATA); 3536 } 3537 3538 off_lo += 4; /* low limit register offset */ 3539 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 3540 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 3541 3542 /* 3543 * Increment device number to next node and register offsets 3544 * for DRAM base register of next node 3545 */ 3546 off_hi += 4; 3547 off_lo += 4; 3548 dev++; 3549 3550 /* 3551 * Both read and write enable bits must be enabled in DRAM 3552 * address map base register for physical memory to exist in 3553 * node 3554 */ 3555 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 3556 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 3557 /* 3558 * Mark node memory as non-existent and set start and 3559 * end addresses to be same in memnode_info[] 3560 */ 3561 memnode_info[node].exists = 0; 3562 memnode_info[node].start = memnode_info[node].end = 3563 (pfn_t)-1; 3564 continue; 3565 } 3566 3567 /* 3568 * Mark node memory as existing and remember physical address 3569 * range of each node for use later 3570 */ 3571 memnode_info[node].exists = 1; 3572 3573 memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 3574 3575 memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 3576 
OPT_DRAMADDR_LO_MASK_OFF); 3577 } 3578 3579 /* 3580 * Restore PCI Extended Configuration Space enable bit 3581 */ 3582 if (opt_family == AMD_FAMILY_GREYHOUND) { 3583 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 3584 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 3585 } 3586 } 3587 3588 3589 /* 3590 * Return average amount of time to read vendor ID register on Northbridge 3591 * N times on specified destination node from current CPU 3592 */ 3593 static hrtime_t 3594 opt_probe_vendor(int dest_node, int nreads) 3595 { 3596 int cnt; 3597 uint_t dev; 3598 /* LINTED: set but not used in function */ 3599 volatile uint_t dev_vendor; 3600 hrtime_t elapsed; 3601 hrtime_t end; 3602 int ipl; 3603 hrtime_t start; 3604 3605 dev = OPT_PCS_DEV_NODE0 + dest_node; 3606 kpreempt_disable(); 3607 ipl = spl8(); 3608 outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM, 3609 OPT_PCS_OFF_VENDOR)); 3610 start = gethrtime(); 3611 for (cnt = 0; cnt < nreads; cnt++) 3612 dev_vendor = inl(PCI_CONFDATA); 3613 end = gethrtime(); 3614 elapsed = (end - start) / nreads; 3615 splx(ipl); 3616 kpreempt_enable(); 3617 return (elapsed); 3618 } 3619