1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 /*
27  * Copyright (c) 2010, Intel Corporation.
28  * All rights reserved.
29  */
30 
31 /*
32  * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
33  * ================================================================
34  * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
35  * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
36  * one or more CPUs and some local memory.  The CPUs in each node can access
37  * the memory in the other nodes but at a higher latency than accessing their
38  * local memory.  Typically, a system with only one node has Uniform Memory
39  * Access (UMA), but it may be possible to have a one node system that has
40  * some global memory outside of the node which is higher latency.
41  *
42  * Module Description
43  * ------------------
44  * This module provides a platform interface for determining which CPUs and
45  * which memory (and how much) are in a NUMA node and how far each node is from
46  * each other.  The interface is used by the Virtual Memory (VM) system and the
47  * common lgroup framework.  The VM system uses the plat_*() routines to fill
48  * in its memory node (memnode) array with the physical address range spanned
49  * by each NUMA node to know which memory belongs to which node, so it can
50  * build and manage a physical page free list for each NUMA node and allocate
51  * local memory from each node as needed.  The common lgroup framework uses the
52  * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
53  * to each node (leaf lgroup) and how far each node is from each other, so it
54  * can build the latency (lgroup) topology for the machine in order to optimize
55  * for locality.  Also, lgroup platform handles are used in the interface with
56  * this module instead of lgroups, so this module shouldn't need to know
57  * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
58  * etc. are in each NUMA node, how far each node is from each other, and to use
59  * a unique lgroup platform handle to refer to each node through the interface.
60  *
61  * Determining NUMA Configuration
62  * ------------------------------
63  * By default, this module will try to determine the NUMA configuration of the
64  * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
65  * Locality Information Table (SLIT).  The SRAT contains info to tell which
66  * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
67  * is a matrix that gives the distance between each system locality (which
68  * is a NUMA node and should correspond to proximity domains in the SRAT).  For
69  * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
70  * specification.
71  *
72  * If the SRAT doesn't exist on a system with AMD Opteron processors, we
73  * examine registers in PCI configuration space to determine how many nodes are
74  * in the system and which CPUs and memory are in each node.  This is only
75  * done while booting the kernel.
76  *
77  * NOTE: Using these PCI configuration space registers to determine this
78  * locality info is not guaranteed to work or be compatible across all
79  * Opteron processor families.
80  *
81  * If the SLIT does not exist or does not look right, the kernel will probe to
82  * determine the distance between nodes as long as the NUMA CPU and memory
83  * configuration has been determined (see lgrp_plat_probe() for details).
84  *
85  * Data Structures
86  * ---------------
87  * The main data structures used by this code are the following:
88  *
89  * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
90  *					CPU ID (only used for SRAT)
91  *
92  * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
93  *					different nodes indexed by node ID
94  *
95  * - lgrp_plat_node_cnt			Number of NUMA nodes in system for
96  *					non-DR-capable systems,
97  *					maximum possible number of NUMA nodes
98  *					in system for DR capable systems.
99  *
100  * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
101  *					table indexed by node ID (only used
102  *					for SRAT)
103  *
104  * - lgrp_plat_memnode_info[]		Table with physical address range for
105  *					each memory node indexed by memory node
106  *					ID
107  *
108  * The code is implemented to make the following always be true:
109  *
110  *	lgroup platform handle == node ID == memnode ID
111  *
112  * Moreover, it allows for the proximity domain ID to be equal to all of the
113  * above as long as the proximity domain IDs are numbered from 0 to <number of
114  * nodes - 1>.  This is done by hashing each proximity domain ID into the range
115  * from 0 to <number of nodes - 1>.  Then proximity domain ID N will hash into
116  * node ID N and proximity domain ID N will be entered into
117  * lgrp_plat_node_domain[N] and be assigned node ID N.  If the proximity domain
118  * IDs aren't numbered from 0 to <number of nodes - 1>, then hashing the
119  * proximity domain IDs into lgrp_plat_node_domain[] will still work for
120  * assigning proximity domain IDs to node IDs.  However, the proximity domain
121  * IDs may not map to the equivalent node ID since we want to keep the node IDs
122  * numbered from 0 to <number of nodes - 1> to minimize cost of searching and
123  * potentially space.
124  *
125  * With the introduction of support of memory DR operations on x86 platforms,
126  * things get a little complicated.  The addresses of hot-added memory may not
127  * be contiguous with other memory connected to the same lgrp node.  In other
128  * words, memory addresses may get interleaved among lgrp nodes after memory
129  * DR operations.  To work around this limitation, we have extended the
130  * relationship between lgrp node and memory node from a 1:1 map to a 1:N map,
131  * which means there may be multiple memory nodes associated with an lgrp node
132  * after memory DR operations.
133  *
134  * To minimize the code changes to support memory DR operations, the
135  * following policies have been adopted:
135  * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
136  *    node ID and memnode ID is still kept as:
137  *	lgroup platform handle == node ID == memnode ID
138  * 2) For memory present at boot time on DR capable platforms, the relationship
139  *    is still kept as is:
140  *	lgroup platform handle == node ID == memnode ID
141  * 3) For hot-added memory, the relationship between lgrp ID and memnode ID has
142  *    been changed from a 1:1 map to a 1:N map.  Memnode IDs
143  *    [0, lgrp_plat_node_cnt) are reserved for memory present at boot time, and
144  *    memnode IDs [lgrp_plat_node_cnt, max_mem_nodes) are used to dynamically
145  *    allocate memnode IDs for hot-added memory.
146  * 4) All boot code having the assumption "node ID == memnode ID" can live as
147  *    is because node ID is always equal to memnode ID at boot time.
148  * 5) The lgrp_plat_memnode_info_update(), plat_pfn_to_mem_node() and
149  *    lgrp_plat_mem_size() related logic has been enhanced to deal with
150  *    the 1:N map relationship.
151  * 6) The latency probing logic, which assumes "node ID == memnode ID" and
152  *    may be called at run time, is disabled if memory DR operations are
153  *    enabled.
154  */
155 
156 
157 #include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
158 #include <sys/atomic.h>
159 #include <sys/bootconf.h>
160 #include <sys/cmn_err.h>
161 #include <sys/controlregs.h>
162 #include <sys/cpupart.h>
163 #include <sys/cpuvar.h>
164 #include <sys/lgrp.h>
165 #include <sys/machsystm.h>
166 #include <sys/memlist.h>
167 #include <sys/memnode.h>
168 #include <sys/mman.h>
169 #include <sys/note.h>
170 #include <sys/pci_cfgspace.h>
171 #include <sys/pci_impl.h>
172 #include <sys/param.h>
173 #include <sys/pghw.h>
174 #include <sys/promif.h>		/* for prom_printf() */
175 #include <sys/sysmacros.h>
176 #include <sys/systm.h>
177 #include <sys/thread.h>
178 #include <sys/types.h>
179 #include <sys/var.h>
180 #include <sys/x86_archext.h>	/* for x86_feature and X86_AMD */
181 #include <vm/hat_i86.h>
182 #include <vm/seg_kmem.h>
183 #include <vm/vm_dep.h>
184 
185 #include <sys/acpidev.h>
186 #include "acpi_fw.h"		/* for SRAT, SLIT and MSCT */
187 
188 
189 #define	MAX_NODES		8
190 #define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)
191 
192 /*
193  * Constants for configuring probing
194  */
195 #define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
196 #define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
197 #define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */
198 
199 /*
200  * Flags for probing
201  */
202 #define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
203 #define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
204 #define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */
205 
206 /*
207  * Hash proximity domain ID into node to domain mapping table "mod" number of
208  * nodes to minimize span of entries used and try to have lowest numbered
209  * proximity domain be node 0
210  */
211 #define	NODE_DOMAIN_HASH(domain, node_cnt) \
212 	((lgrp_plat_prox_domain_min == UINT32_MAX) ?
(domain) % node_cnt : \ 213 ((domain) - lgrp_plat_prox_domain_min) % node_cnt) 214 215 /* 216 * CPU to node ID mapping structure (only used with SRAT) 217 */ 218 typedef struct cpu_node_map { 219 int exists; 220 uint_t node; 221 uint32_t apicid; 222 uint32_t prox_domain; 223 } cpu_node_map_t; 224 225 /* 226 * Latency statistics 227 */ 228 typedef struct lgrp_plat_latency_stats { 229 hrtime_t latencies[MAX_NODES][MAX_NODES]; 230 hrtime_t latency_max; 231 hrtime_t latency_min; 232 } lgrp_plat_latency_stats_t; 233 234 /* 235 * Memory configuration for probing 236 */ 237 typedef struct lgrp_plat_probe_mem_config { 238 size_t probe_memsize; /* how much memory to probe per node */ 239 caddr_t probe_va[MAX_NODES]; /* where memory mapped for probing */ 240 pfn_t probe_pfn[MAX_NODES]; /* physical pages to map for probing */ 241 } lgrp_plat_probe_mem_config_t; 242 243 /* 244 * Statistics kept for probing 245 */ 246 typedef struct lgrp_plat_probe_stats { 247 hrtime_t flush_cost; 248 hrtime_t probe_cost; 249 hrtime_t probe_cost_total; 250 hrtime_t probe_error_code; 251 hrtime_t probe_errors[MAX_NODES][MAX_NODES]; 252 int probe_suspect[MAX_NODES][MAX_NODES]; 253 hrtime_t probe_max[MAX_NODES][MAX_NODES]; 254 hrtime_t probe_min[MAX_NODES][MAX_NODES]; 255 } lgrp_plat_probe_stats_t; 256 257 /* 258 * Node to proximity domain ID mapping structure (only used with SRAT) 259 */ 260 typedef struct node_domain_map { 261 int exists; 262 uint32_t prox_domain; 263 } node_domain_map_t; 264 265 /* 266 * Node ID and starting and ending page for physical memory in memory node 267 */ 268 typedef struct memnode_phys_addr_map { 269 pfn_t start; 270 pfn_t end; 271 int exists; 272 uint32_t prox_domain; 273 uint32_t device_id; 274 uint_t lgrphand; 275 } memnode_phys_addr_map_t; 276 277 /* 278 * Number of CPUs for which we got APIC IDs 279 */ 280 static int lgrp_plat_apic_ncpus = 0; 281 282 /* 283 * CPU to node ID mapping table (only used for SRAT) and its max number of 284 * entries 285 */ 286 static cpu_node_map_t *lgrp_plat_cpu_node = NULL; 287 static uint_t lgrp_plat_cpu_node_nentries = 0; 288 289 /* 290 * Latency statistics 291 */ 292 lgrp_plat_latency_stats_t lgrp_plat_lat_stats; 293 294 /* 295 * Whether memory is interleaved across nodes causing MPO to be disabled 296 */ 297 static int lgrp_plat_mem_intrlv = 0; 298 299 /* 300 * Node ID to proximity domain ID mapping table (only used for SRAT) 301 */ 302 static node_domain_map_t lgrp_plat_node_domain[MAX_NODES]; 303 304 /* 305 * Physical address range for memory in each node 306 */ 307 static memnode_phys_addr_map_t lgrp_plat_memnode_info[MAX_MEM_NODES]; 308 309 /* 310 * Statistics gotten from probing 311 */ 312 static lgrp_plat_probe_stats_t lgrp_plat_probe_stats; 313 314 /* 315 * Memory configuration for probing 316 */ 317 static lgrp_plat_probe_mem_config_t lgrp_plat_probe_mem_config; 318 319 /* 320 * Lowest proximity domain ID seen in ACPI SRAT 321 */ 322 static uint32_t lgrp_plat_prox_domain_min = UINT32_MAX; 323 324 /* 325 * Error code from processing ACPI SRAT 326 */ 327 static int lgrp_plat_srat_error = 0; 328 329 /* 330 * Error code from processing ACPI SLIT 331 */ 332 static int lgrp_plat_slit_error = 0; 333 334 /* 335 * Whether lgrp topology has been flattened to 2 levels. 336 */ 337 static int lgrp_plat_topo_flatten = 0; 338 339 340 /* 341 * Maximum memory node ID in use. 
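 * This is set to lgrp_plat_node_cnt while booting.  On systems that support
 * memory DR, it may then grow toward max_mem_nodes as memnode IDs in the range
 * [lgrp_plat_node_cnt, max_mem_nodes) are handed out for hot-added memory (see
 * the memory DR policies described at the top of this file).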
342 */ 343 static uint_t lgrp_plat_max_mem_node; 344 345 /* 346 * Allocate lgroup array statically 347 */ 348 static lgrp_t lgrp_space[NLGRP]; 349 static int nlgrps_alloc; 350 351 352 /* 353 * Enable finding and using minimum proximity domain ID when hashing 354 */ 355 int lgrp_plat_domain_min_enable = 1; 356 357 /* 358 * Maximum possible number of nodes in system 359 */ 360 uint_t lgrp_plat_node_cnt = 1; 361 362 /* 363 * Enable sorting nodes in ascending order by starting physical address 364 */ 365 int lgrp_plat_node_sort_enable = 1; 366 367 /* 368 * Configuration Parameters for Probing 369 * - lgrp_plat_probe_flags Flags to specify enabling probing, probe 370 * operation, etc. 371 * - lgrp_plat_probe_nrounds How many rounds of probing to do 372 * - lgrp_plat_probe_nsamples Number of samples to take when probing each 373 * node 374 * - lgrp_plat_probe_nreads Number of times to read vendor ID from 375 * Northbridge for each probe 376 */ 377 uint_t lgrp_plat_probe_flags = 0; 378 int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 379 int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 380 int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 381 382 /* 383 * Enable use of ACPI System Resource Affinity Table (SRAT), System 384 * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT) 385 */ 386 int lgrp_plat_srat_enable = 1; 387 int lgrp_plat_slit_enable = 1; 388 int lgrp_plat_msct_enable = 1; 389 390 /* 391 * mnode_xwa: set to non-zero value to initiate workaround if large pages are 392 * found to be crossing memory node boundaries. The workaround will eliminate 393 * a base size page at the end of each memory node boundary to ensure that 394 * a large page with constituent pages that span more than 1 memory node 395 * can never be formed. 
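 * A value of 0 disables the workaround (and makes plat_mnode_xcheck() report
 * any crossing it finds), 1 enables it, and plat_mnode_xcheck() increments it
 * past 1 once a crossing is actually seen so that plat_build_mem_nodes()
 * knows to sacrifice the trailing page in each memory node.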
396 * 397 */ 398 int mnode_xwa = 1; 399 400 /* 401 * Static array to hold lgroup statistics 402 */ 403 struct lgrp_stats lgrp_stats[NLGRP]; 404 405 406 /* 407 * Forward declarations of platform interface routines 408 */ 409 void plat_build_mem_nodes(struct memlist *list); 410 411 int plat_mnode_xcheck(pfn_t pfncnt); 412 413 lgrp_handle_t plat_mem_node_to_lgrphand(int mnode); 414 415 int plat_pfn_to_mem_node(pfn_t pfn); 416 417 /* 418 * Forward declarations of lgroup platform interface routines 419 */ 420 lgrp_t *lgrp_plat_alloc(lgrp_id_t lgrpid); 421 422 void lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg); 423 424 lgrp_handle_t lgrp_plat_cpu_to_hand(processorid_t id); 425 426 void lgrp_plat_init(lgrp_init_stages_t stage); 427 428 int lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to); 429 430 int lgrp_plat_max_lgrps(void); 431 432 pgcnt_t lgrp_plat_mem_size(lgrp_handle_t plathand, 433 lgrp_mem_query_t query); 434 435 lgrp_handle_t lgrp_plat_pfn_to_hand(pfn_t pfn); 436 437 void lgrp_plat_probe(void); 438 439 lgrp_handle_t lgrp_plat_root_hand(void); 440 441 442 /* 443 * Forward declarations of local routines 444 */ 445 static int is_opteron(void); 446 447 static int lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, 448 int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, 449 uint32_t domain); 450 451 static int lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, 452 int cpu_node_nentries); 453 454 static int lgrp_plat_domain_to_node(node_domain_map_t *node_domain, 455 int node_cnt, uint32_t domain); 456 457 static void lgrp_plat_get_numa_config(void); 458 459 static void lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, 460 lgrp_plat_latency_stats_t *lat_stats, 461 lgrp_plat_probe_stats_t *probe_stats); 462 463 static int lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, 464 lgrp_plat_latency_stats_t *lat_stats); 465 466 static void lgrp_plat_main_init(void); 467 468 static pgcnt_t lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t); 469 470 static int lgrp_plat_node_domain_update(node_domain_map_t *node_domain, 471 int node_cnt, uint32_t domain); 472 473 static int lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, 474 int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt, 475 uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id); 476 477 static void lgrp_plat_node_sort(node_domain_map_t *node_domain, 478 int node_cnt, cpu_node_map_t *cpu_node, int cpu_count, 479 memnode_phys_addr_map_t *memnode_info); 480 481 static hrtime_t lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, 482 int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config, 483 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats); 484 485 static int lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node); 486 487 static int lgrp_plat_process_slit(struct slit *tp, 488 node_domain_map_t *node_domain, uint_t node_cnt, 489 memnode_phys_addr_map_t *memnode_info, 490 lgrp_plat_latency_stats_t *lat_stats); 491 492 static int lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info, 493 uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt, 494 lgrp_plat_latency_stats_t *lat_stats); 495 496 static int lgrp_plat_process_srat(struct srat *tp, struct msct *mp, 497 uint32_t *prox_domain_min, node_domain_map_t *node_domain, 498 cpu_node_map_t *cpu_node, int cpu_count, 499 memnode_phys_addr_map_t *memnode_info); 500 501 static void lgrp_plat_release_bootstrap(void); 502 
503 static int lgrp_plat_srat_domains(struct srat *tp, 504 uint32_t *prox_domain_min); 505 506 static int lgrp_plat_msct_domains(struct msct *tp, 507 uint32_t *prox_domain_min); 508 509 static void lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats); 510 511 static void opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 512 memnode_phys_addr_map_t *memnode_info); 513 514 static hrtime_t opt_probe_vendor(int dest_node, int nreads); 515 516 517 /* 518 * PLATFORM INTERFACE ROUTINES 519 */ 520 521 /* 522 * Configure memory nodes for machines with more than one node (ie NUMA) 523 */ 524 void 525 plat_build_mem_nodes(struct memlist *list) 526 { 527 pfn_t cur_start; /* start addr of subrange */ 528 pfn_t cur_end; /* end addr of subrange */ 529 pfn_t start; /* start addr of whole range */ 530 pfn_t end; /* end addr of whole range */ 531 pgcnt_t endcnt; /* pages to sacrifice */ 532 533 /* 534 * Boot install lists are arranged <addr, len>, ... 535 */ 536 while (list) { 537 int node; 538 539 start = list->ml_address >> PAGESHIFT; 540 end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT; 541 542 if (start > physmax) { 543 list = list->ml_next; 544 continue; 545 } 546 if (end > physmax) 547 end = physmax; 548 549 /* 550 * When there is only one memnode, just add memory to memnode 551 */ 552 if (max_mem_nodes == 1) { 553 mem_node_add_slice(start, end); 554 list = list->ml_next; 555 continue; 556 } 557 558 /* 559 * mem_node_add_slice() expects to get a memory range that 560 * is within one memnode, so need to split any memory range 561 * that spans multiple memnodes into subranges that are each 562 * contained within one memnode when feeding them to 563 * mem_node_add_slice() 564 */ 565 cur_start = start; 566 do { 567 node = plat_pfn_to_mem_node(cur_start); 568 569 /* 570 * Panic if DRAM address map registers or SRAT say 571 * memory in node doesn't exist or address from 572 * boot installed memory list entry isn't in this node. 573 * This shouldn't happen and rest of code can't deal 574 * with this if it does. 575 */ 576 if (node < 0 || node >= lgrp_plat_max_mem_node || 577 !lgrp_plat_memnode_info[node].exists || 578 cur_start < lgrp_plat_memnode_info[node].start || 579 cur_start > lgrp_plat_memnode_info[node].end) { 580 cmn_err(CE_PANIC, "Don't know which memnode " 581 "to add installed memory address 0x%lx\n", 582 cur_start); 583 } 584 585 /* 586 * End of current subrange should not span memnodes 587 */ 588 cur_end = end; 589 endcnt = 0; 590 if (lgrp_plat_memnode_info[node].exists && 591 cur_end > lgrp_plat_memnode_info[node].end) { 592 cur_end = lgrp_plat_memnode_info[node].end; 593 if (mnode_xwa > 1) { 594 /* 595 * sacrifice the last page in each 596 * node to eliminate large pages 597 * that span more than 1 memory node. 598 */ 599 endcnt = 1; 600 physinstalled--; 601 } 602 } 603 604 mem_node_add_slice(cur_start, cur_end - endcnt); 605 606 /* 607 * Next subrange starts after end of current one 608 */ 609 cur_start = cur_end + 1; 610 } while (cur_end < end); 611 612 list = list->ml_next; 613 } 614 mem_node_physalign = 0; 615 mem_node_pfn_shift = 0; 616 } 617 618 619 /* 620 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt 621 * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if 622 * a crossing is found and returns 0 otherwise. 
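 *
 * For example (illustrative numbers only), with pfncnt == 0x200 (a 2MB large
 * page made up of 4K base pages): if one node ends at pfn 0x18ff and the next
 * node starts at the contiguous but unaligned pfn 0x1900, the aligned range
 * [0x1800, 0x1a00) is exactly pfncnt pages long and, assuming both nodes
 * extend far enough on either side, spans both nodes; plat_mnode_xcheck()
 * then either reports the crossing (when mnode_xwa is 0) or bumps mnode_xwa
 * to arm the workaround.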
623 */ 624 int 625 plat_mnode_xcheck(pfn_t pfncnt) 626 { 627 int node, prevnode = -1, basenode; 628 pfn_t ea, sa; 629 630 for (node = 0; node < lgrp_plat_max_mem_node; node++) { 631 632 if (lgrp_plat_memnode_info[node].exists == 0) 633 continue; 634 635 if (prevnode == -1) { 636 prevnode = node; 637 basenode = node; 638 continue; 639 } 640 641 /* assume x86 node pfn ranges are in increasing order */ 642 ASSERT(lgrp_plat_memnode_info[node].start > 643 lgrp_plat_memnode_info[prevnode].end); 644 645 /* 646 * continue if the starting address of node is not contiguous 647 * with the previous node. 648 */ 649 650 if (lgrp_plat_memnode_info[node].start != 651 (lgrp_plat_memnode_info[prevnode].end + 1)) { 652 basenode = node; 653 prevnode = node; 654 continue; 655 } 656 657 /* check if the starting address of node is pfncnt aligned */ 658 if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) { 659 660 /* 661 * at this point, node starts at an unaligned boundary 662 * and is contiguous with the previous node(s) to 663 * basenode. Check if there is an aligned contiguous 664 * range of length pfncnt that crosses this boundary. 665 */ 666 667 sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end, 668 pfncnt); 669 ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start), 670 pfncnt); 671 672 ASSERT((ea - sa) == pfncnt); 673 if (sa >= lgrp_plat_memnode_info[basenode].start && 674 ea <= (lgrp_plat_memnode_info[node].end + 1)) { 675 /* 676 * large page found to cross mnode boundary. 677 * Return Failure if workaround not enabled. 678 */ 679 if (mnode_xwa == 0) 680 return (1); 681 mnode_xwa++; 682 } 683 } 684 prevnode = node; 685 } 686 return (0); 687 } 688 689 690 lgrp_handle_t 691 plat_mem_node_to_lgrphand(int mnode) 692 { 693 if (max_mem_nodes == 1) 694 return (LGRP_DEFAULT_HANDLE); 695 696 ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node); 697 698 return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand)); 699 } 700 701 int 702 plat_pfn_to_mem_node(pfn_t pfn) 703 { 704 int node; 705 706 if (max_mem_nodes == 1) 707 return (0); 708 709 for (node = 0; node < lgrp_plat_max_mem_node; node++) { 710 /* 711 * Skip nodes with no memory 712 */ 713 if (!lgrp_plat_memnode_info[node].exists) 714 continue; 715 716 membar_consumer(); 717 if (pfn >= lgrp_plat_memnode_info[node].start && 718 pfn <= lgrp_plat_memnode_info[node].end) 719 return (node); 720 } 721 722 /* 723 * Didn't find memnode where this PFN lives which should never happen 724 */ 725 ASSERT(node < lgrp_plat_max_mem_node); 726 return (-1); 727 } 728 729 730 /* 731 * LGROUP PLATFORM INTERFACE ROUTINES 732 */ 733 734 /* 735 * Allocate additional space for an lgroup. 736 */ 737 lgrp_t * 738 lgrp_plat_alloc(lgrp_id_t lgrpid) 739 { 740 lgrp_t *lgrp; 741 742 lgrp = &lgrp_space[nlgrps_alloc++]; 743 if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 744 return (NULL); 745 return (lgrp); 746 } 747 748 749 /* 750 * Platform handling for (re)configuration changes 751 * 752 * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug: 753 * 1) Use cpu_lock to synchronize between lgrp_plat_config() and 754 * lgrp_plat_cpu_to_hand(). 755 * 2) Disable latency probing logic by making sure that the flag 756 * LGRP_PLAT_PROBE_ENABLE is cleared. 757 * 758 * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug: 759 * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal. 760 * 2) Only expansion to existing entries, no shrinking. 761 * 3) On writing side, DR framework ensures that lgrp_plat_config() is called 762 * in single-threaded context. 
And membar_producer() is used to ensure that 763 * all changes are visible to other CPUs before setting the "exists" flag. 764 * 4) On reading side, membar_consumer() after checking the "exists" flag 765 * ensures that right values are retrieved. 766 * 767 * Mechanism to protect lgrp_plat_node_domain[] at hotplug: 768 * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal. 769 * 2) On writing side, it's single-threaded and membar_producer() is used to 770 * ensure all changes are visible to other CPUs before setting the "exists" 771 * flag. 772 * 3) On reading side, membar_consumer() after checking the "exists" flag 773 * ensures that right values are retrieved. 774 */ 775 void 776 lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 777 { 778 #ifdef __xpv 779 _NOTE(ARGUNUSED(flag, arg)); 780 #else 781 int rc, node; 782 cpu_t *cp; 783 void *hdl = NULL; 784 uchar_t *sliptr = NULL; 785 uint32_t domain, apicid, slicnt = 0; 786 update_membounds_t *mp; 787 788 extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *, 789 uint32_t *, uint32_t *, uchar_t **); 790 extern void acpidev_dr_free_cpu_numa_info(void *); 791 792 /* 793 * This interface is used to support CPU/memory DR operations. 794 * Don't bother here if it's still during boot or only one lgrp node 795 * is supported. 796 */ 797 if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1) 798 return; 799 800 switch (flag) { 801 case LGRP_CONFIG_CPU_ADD: 802 cp = (cpu_t *)arg; 803 ASSERT(cp != NULL); 804 ASSERT(MUTEX_HELD(&cpu_lock)); 805 806 /* Check whether CPU already exists. */ 807 ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists); 808 if (lgrp_plat_cpu_node[cp->cpu_id].exists) { 809 cmn_err(CE_WARN, 810 "!lgrp: CPU(%d) already exists in cpu_node map.", 811 cp->cpu_id); 812 break; 813 } 814 815 /* Query CPU lgrp information. */ 816 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain, 817 &slicnt, &sliptr); 818 ASSERT(rc == 0); 819 if (rc != 0) { 820 cmn_err(CE_WARN, 821 "!lgrp: failed to query lgrp info for CPU(%d).", 822 cp->cpu_id); 823 break; 824 } 825 826 /* Update node to proximity domain mapping */ 827 node = lgrp_plat_domain_to_node(lgrp_plat_node_domain, 828 lgrp_plat_node_cnt, domain); 829 if (node == -1) { 830 node = lgrp_plat_node_domain_update( 831 lgrp_plat_node_domain, lgrp_plat_node_cnt, domain); 832 ASSERT(node != -1); 833 if (node == -1) { 834 acpidev_dr_free_cpu_numa_info(hdl); 835 cmn_err(CE_WARN, "!lgrp: failed to update " 836 "node_domain map for domain(%u).", domain); 837 break; 838 } 839 } 840 841 /* Update latency information among lgrps. */ 842 if (slicnt != 0 && sliptr != NULL) { 843 if (lgrp_plat_process_sli(domain, sliptr, slicnt, 844 lgrp_plat_node_domain, lgrp_plat_node_cnt, 845 &lgrp_plat_lat_stats) != 0) { 846 cmn_err(CE_WARN, "!lgrp: failed to update " 847 "latency information for domain (%u).", 848 domain); 849 } 850 } 851 852 /* Update CPU to node mapping. */ 853 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain; 854 lgrp_plat_cpu_node[cp->cpu_id].node = node; 855 lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid; 856 lgrp_plat_cpu_node[cp->cpu_id].exists = 1; 857 lgrp_plat_apic_ncpus++; 858 859 acpidev_dr_free_cpu_numa_info(hdl); 860 break; 861 862 case LGRP_CONFIG_CPU_DEL: 863 cp = (cpu_t *)arg; 864 ASSERT(cp != NULL); 865 ASSERT(MUTEX_HELD(&cpu_lock)); 866 867 /* Check whether CPU exists. 
*/ 868 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists); 869 if (!lgrp_plat_cpu_node[cp->cpu_id].exists) { 870 cmn_err(CE_WARN, 871 "!lgrp: CPU(%d) doesn't exist in cpu_node map.", 872 cp->cpu_id); 873 break; 874 } 875 876 /* Query CPU lgrp information. */ 877 rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain, 878 NULL, NULL); 879 ASSERT(rc == 0); 880 if (rc != 0) { 881 cmn_err(CE_WARN, 882 "!lgrp: failed to query lgrp info for CPU(%d).", 883 cp->cpu_id); 884 break; 885 } 886 887 /* Update map. */ 888 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid); 889 ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain); 890 lgrp_plat_cpu_node[cp->cpu_id].exists = 0; 891 lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX; 892 lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX; 893 lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX; 894 lgrp_plat_apic_ncpus--; 895 896 acpidev_dr_free_cpu_numa_info(hdl); 897 break; 898 899 case LGRP_CONFIG_MEM_ADD: 900 mp = (update_membounds_t *)arg; 901 ASSERT(mp != NULL); 902 903 /* Update latency information among lgrps. */ 904 if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) { 905 if (lgrp_plat_process_sli(mp->u_domain, 906 mp->u_sli_ptr, mp->u_sli_cnt, 907 lgrp_plat_node_domain, lgrp_plat_node_cnt, 908 &lgrp_plat_lat_stats) != 0) { 909 cmn_err(CE_WARN, "!lgrp: failed to update " 910 "latency information for domain (%u).", 911 domain); 912 } 913 } 914 915 if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain, 916 lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes, 917 mp->u_base, mp->u_base + mp->u_length, 918 mp->u_domain, mp->u_device_id) < 0) { 919 cmn_err(CE_WARN, 920 "!lgrp: failed to update latency information for " 921 "memory (0x%" PRIx64 " - 0x%" PRIx64 ").", 922 mp->u_base, mp->u_base + mp->u_length); 923 } 924 break; 925 926 default: 927 break; 928 } 929 #endif /* __xpv */ 930 } 931 932 933 /* 934 * Return the platform handle for the lgroup containing the given CPU 935 */ 936 lgrp_handle_t 937 lgrp_plat_cpu_to_hand(processorid_t id) 938 { 939 lgrp_handle_t hand; 940 941 ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock)); 942 943 if (lgrp_plat_node_cnt == 1) 944 return (LGRP_DEFAULT_HANDLE); 945 946 hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id], 947 lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries); 948 949 ASSERT(hand != (lgrp_handle_t)-1); 950 if (hand == (lgrp_handle_t)-1) 951 return (LGRP_NULL_HANDLE); 952 953 return (hand); 954 } 955 956 957 /* 958 * Platform-specific initialization of lgroups 959 */ 960 void 961 lgrp_plat_init(lgrp_init_stages_t stage) 962 { 963 #if defined(__xpv) 964 #else /* __xpv */ 965 u_longlong_t value; 966 #endif /* __xpv */ 967 968 switch (stage) { 969 case LGRP_INIT_STAGE1: 970 #if defined(__xpv) 971 /* 972 * XXPV For now, the hypervisor treats all memory equally. 
973 */ 974 lgrp_plat_node_cnt = max_mem_nodes = 1; 975 #else /* __xpv */ 976 977 /* 978 * Get boot property for lgroup topology height limit 979 */ 980 if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0) 981 (void) lgrp_topo_ht_limit_set((int)value); 982 983 /* 984 * Get boot property for enabling/disabling SRAT 985 */ 986 if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0) 987 lgrp_plat_srat_enable = (int)value; 988 989 /* 990 * Get boot property for enabling/disabling SLIT 991 */ 992 if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0) 993 lgrp_plat_slit_enable = (int)value; 994 995 /* 996 * Get boot property for enabling/disabling MSCT 997 */ 998 if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0) 999 lgrp_plat_msct_enable = (int)value; 1000 1001 /* 1002 * Initialize as a UMA machine 1003 */ 1004 if (lgrp_topo_ht_limit() == 1) { 1005 lgrp_plat_node_cnt = max_mem_nodes = 1; 1006 lgrp_plat_max_mem_node = 1; 1007 return; 1008 } 1009 1010 lgrp_plat_get_numa_config(); 1011 1012 /* 1013 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes 1014 * to support memory DR operations if memory DR is enabled. 1015 */ 1016 lgrp_plat_max_mem_node = lgrp_plat_node_cnt; 1017 if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) { 1018 max_mem_nodes = MAX_MEM_NODES_PER_LGROUP * 1019 lgrp_plat_node_cnt; 1020 ASSERT(max_mem_nodes <= MAX_MEM_NODES); 1021 } 1022 #endif /* __xpv */ 1023 break; 1024 1025 case LGRP_INIT_STAGE3: 1026 lgrp_plat_probe(); 1027 lgrp_plat_release_bootstrap(); 1028 break; 1029 1030 case LGRP_INIT_STAGE4: 1031 lgrp_plat_main_init(); 1032 break; 1033 1034 default: 1035 break; 1036 } 1037 } 1038 1039 1040 /* 1041 * Return latency between "from" and "to" lgroups 1042 * 1043 * This latency number can only be used for relative comparison 1044 * between lgroups on the running system, cannot be used across platforms, 1045 * and may not reflect the actual latency. It is platform and implementation 1046 * specific, so platform gets to decide its value. It would be nice if the 1047 * number was at least proportional to make comparisons more meaningful though. 1048 */ 1049 int 1050 lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 1051 { 1052 lgrp_handle_t src, dest; 1053 int node; 1054 1055 if (max_mem_nodes == 1) 1056 return (0); 1057 1058 /* 1059 * Return max latency for root lgroup 1060 */ 1061 if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 1062 return (lgrp_plat_lat_stats.latency_max); 1063 1064 src = from; 1065 dest = to; 1066 1067 /* 1068 * Return 0 for nodes (lgroup platform handles) out of range 1069 */ 1070 if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 1071 return (0); 1072 1073 /* 1074 * Probe from current CPU if its lgroup latencies haven't been set yet 1075 * and we are trying to get latency from current CPU to some node. 1076 * Avoid probing if CPU/memory DR is enabled. 1077 */ 1078 if (lgrp_plat_lat_stats.latencies[src][src] == 0) { 1079 /* 1080 * Latency information should be updated by lgrp_plat_config() 1081 * for DR operations. Something is wrong if reaches here. 1082 * For safety, flatten lgrp topology to two levels. 
1083 		 */
1084 		if (plat_dr_support_cpu() || plat_dr_support_memory()) {
1085 			ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
1086 			cmn_err(CE_WARN,
1087 			    "lgrp: failed to get latency information, "
1088 			    "fall back to two-level topology.");
1089 			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
1090 		} else {
1091 			node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
1092 			    lgrp_plat_cpu_node_nentries);
1093 			ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
1094 			if (node == src)
1095 				lgrp_plat_probe();
1096 		}
1097 	}
1098 
1099 	return (lgrp_plat_lat_stats.latencies[src][dest]);
1100 }
1101 
1102 
1103 /*
1104  * Return the maximum number of lgrps supported by the platform.
1105  * Before lgrp topology is known it returns an estimate based on the number of
1106  * nodes.  Once topology is known it returns:
1107  * 1) the actual maximum number of lgrps created if CPU/memory DR operations
1108  *    are not supported.
1109  * 2) the maximum possible number of lgrps if CPU/memory DR operations are
1110  *    supported.
1111  */
1112 int
1113 lgrp_plat_max_lgrps(void)
1114 {
1115 	if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
1116 	    plat_dr_support_memory()) {
1117 		return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
1118 	} else {
1119 		return (lgrp_alloc_max + 1);
1120 	}
1121 }
1122 
1123 
1124 /*
1125  * Count number of memory pages (_t) based on mnode id (_n) and query type (_q).
1126  */
1127 #define	_LGRP_PLAT_MEM_SIZE(_n, _q, _t)					\
1128 	if (mem_node_config[_n].exists) {				\
1129 		switch (_q) {						\
1130 		case LGRP_MEM_SIZE_FREE:				\
1131 			_t += MNODE_PGCNT(_n);				\
1132 			break;						\
1133 		case LGRP_MEM_SIZE_AVAIL:				\
1134 			_t += mem_node_memlist_pages(_n, phys_avail);	\
1135 			break;						\
1136 		case LGRP_MEM_SIZE_INSTALL:				\
1137 			_t += mem_node_memlist_pages(_n, phys_install);	\
1138 			break;						\
1139 		default:						\
1140 			break;						\
1141 		}							\
1142 	}
1143 
1144 /*
1145  * Return the number of free pages in an lgroup.
1146  *
1147  * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
1148  * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
1149  * number of allocatable base pagesize pages corresponding to the
1150  * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
1151  * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
1152  * memory installed, regardless of whether or not it's usable.
1153  */
1154 pgcnt_t
1155 lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
1156 {
1157 	int	mnode;
1158 	pgcnt_t npgs = (pgcnt_t)0;
1159 	extern struct memlist *phys_avail;
1160 	extern struct memlist *phys_install;
1161 
1162 
1163 	if (plathand == LGRP_DEFAULT_HANDLE)
1164 		return (lgrp_plat_mem_size_default(plathand, query));
1165 
1166 	if (plathand != LGRP_NULL_HANDLE) {
1167 		/* Count memory node present at boot. */
1168 		mnode = (int)plathand;
1169 		ASSERT(mnode < lgrp_plat_node_cnt);
1170 		_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
1171 
1172 		/* Count possible hot-added memory nodes.
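		 * Hot-added memnodes no longer obey "node ID == memnode ID",
		 * so they are matched back to this lgroup via the lgrphand
		 * field of lgrp_plat_memnode_info[].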
*/ 1173 for (mnode = lgrp_plat_node_cnt; 1174 mnode < lgrp_plat_max_mem_node; mnode++) { 1175 if (lgrp_plat_memnode_info[mnode].lgrphand == plathand) 1176 _LGRP_PLAT_MEM_SIZE(mnode, query, npgs); 1177 } 1178 } 1179 1180 return (npgs); 1181 } 1182 1183 1184 /* 1185 * Return the platform handle of the lgroup that contains the physical memory 1186 * corresponding to the given page frame number 1187 */ 1188 lgrp_handle_t 1189 lgrp_plat_pfn_to_hand(pfn_t pfn) 1190 { 1191 int mnode; 1192 1193 if (max_mem_nodes == 1) 1194 return (LGRP_DEFAULT_HANDLE); 1195 1196 if (pfn > physmax) 1197 return (LGRP_NULL_HANDLE); 1198 1199 mnode = plat_pfn_to_mem_node(pfn); 1200 if (mnode < 0) 1201 return (LGRP_NULL_HANDLE); 1202 1203 return (MEM_NODE_2_LGRPHAND(mnode)); 1204 } 1205 1206 1207 /* 1208 * Probe memory in each node from current CPU to determine latency topology 1209 * 1210 * The probing code will probe the vendor ID register on the Northbridge of 1211 * Opteron processors and probe memory for other processors by default. 1212 * 1213 * Since probing is inherently error prone, the code takes laps across all the 1214 * nodes probing from each node to each of the other nodes some number of 1215 * times. Furthermore, each node is probed some number of times before moving 1216 * onto the next one during each lap. The minimum latency gotten between nodes 1217 * is kept as the latency between the nodes. 1218 * 1219 * After all that, the probe times are adjusted by normalizing values that are 1220 * close to each other and local latencies are made the same. Lastly, the 1221 * latencies are verified to make sure that certain conditions are met (eg. 1222 * local < remote, latency(a, b) == latency(b, a), etc.). 1223 * 1224 * If any of the conditions aren't met, the code will export a NUMA 1225 * configuration with the local CPUs and memory given by the SRAT or PCI config 1226 * space registers and one remote memory latency since it can't tell exactly 1227 * how far each node is from each other. 1228 */ 1229 void 1230 lgrp_plat_probe(void) 1231 { 1232 int from; 1233 int i; 1234 lgrp_plat_latency_stats_t *lat_stats; 1235 boolean_t probed; 1236 hrtime_t probe_time; 1237 int to; 1238 1239 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 1240 max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 1241 return; 1242 1243 /* SRAT and SLIT should be enabled if DR operations are enabled. */ 1244 if (plat_dr_support_cpu() || plat_dr_support_memory()) 1245 return; 1246 1247 /* 1248 * Determine ID of node containing current CPU 1249 */ 1250 from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 1251 lgrp_plat_cpu_node_nentries); 1252 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 1253 if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 1254 ASSERT(lgrp_plat_node_domain[from].exists); 1255 1256 /* 1257 * Don't need to probe if got times already 1258 */ 1259 lat_stats = &lgrp_plat_lat_stats; 1260 if (lat_stats->latencies[from][from] != 0) 1261 return; 1262 1263 /* 1264 * Read vendor ID in Northbridge or read and write page(s) 1265 * in each node from current CPU and remember how long it takes, 1266 * so we can build latency topology of machine later. 1267 * This should approximate the memory latency between each node. 
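	 * The loop below makes lgrp_plat_probe_nrounds passes over the nodes,
	 * keeps the lowest time seen for each (from, to) pair as the latency,
	 * and tracks the overall minimum and maximum probe times.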
1268 */ 1269 probed = B_FALSE; 1270 for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 1271 for (to = 0; to < lgrp_plat_node_cnt; to++) { 1272 /* 1273 * Get probe time and skip over any nodes that can't be 1274 * probed yet or don't have memory 1275 */ 1276 probe_time = lgrp_plat_probe_time(to, 1277 lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries, 1278 &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats, 1279 &lgrp_plat_probe_stats); 1280 if (probe_time == 0) 1281 continue; 1282 1283 probed = B_TRUE; 1284 1285 /* 1286 * Keep lowest probe time as latency between nodes 1287 */ 1288 if (lat_stats->latencies[from][to] == 0 || 1289 probe_time < lat_stats->latencies[from][to]) 1290 lat_stats->latencies[from][to] = probe_time; 1291 1292 /* 1293 * Update overall minimum and maximum probe times 1294 * across all nodes 1295 */ 1296 if (probe_time < lat_stats->latency_min || 1297 lat_stats->latency_min == -1) 1298 lat_stats->latency_min = probe_time; 1299 if (probe_time > lat_stats->latency_max) 1300 lat_stats->latency_max = probe_time; 1301 } 1302 } 1303 1304 /* 1305 * Bail out if weren't able to probe any nodes from current CPU 1306 */ 1307 if (probed == B_FALSE) 1308 return; 1309 1310 /* 1311 * - Fix up latencies such that local latencies are same, 1312 * latency(i, j) == latency(j, i), etc. (if possible) 1313 * 1314 * - Verify that latencies look ok 1315 * 1316 * - Fallback to just optimizing for local and remote if 1317 * latencies didn't look right 1318 */ 1319 lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats, 1320 &lgrp_plat_probe_stats); 1321 lgrp_plat_probe_stats.probe_error_code = 1322 lgrp_plat_latency_verify(lgrp_plat_memnode_info, 1323 &lgrp_plat_lat_stats); 1324 if (lgrp_plat_probe_stats.probe_error_code) 1325 lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 1326 } 1327 1328 1329 /* 1330 * Return platform handle for root lgroup 1331 */ 1332 lgrp_handle_t 1333 lgrp_plat_root_hand(void) 1334 { 1335 return (LGRP_DEFAULT_HANDLE); 1336 } 1337 1338 1339 /* 1340 * INTERNAL ROUTINES 1341 */ 1342 1343 1344 /* 1345 * Update CPU to node mapping for given CPU and proximity domain. 1346 * Return values: 1347 * - zero for success 1348 * - positive numbers for warnings 1349 * - negative numbers for errors 1350 */ 1351 static int 1352 lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt, 1353 cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain) 1354 { 1355 uint_t i; 1356 int node; 1357 1358 /* 1359 * Get node number for proximity domain 1360 */ 1361 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 1362 if (node == -1) { 1363 node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1364 domain); 1365 if (node == -1) 1366 return (-1); 1367 } 1368 1369 /* 1370 * Search for entry with given APIC ID and fill in its node and 1371 * proximity domain IDs (if they haven't been set already) 1372 */ 1373 for (i = 0; i < nentries; i++) { 1374 /* 1375 * Skip nonexistent entries and ones without matching APIC ID 1376 */ 1377 if (!cpu_node[i].exists || cpu_node[i].apicid != apicid) 1378 continue; 1379 1380 /* 1381 * Just return if entry completely and correctly filled in 1382 * already 1383 */ 1384 if (cpu_node[i].prox_domain == domain && 1385 cpu_node[i].node == node) 1386 return (1); 1387 1388 /* 1389 * It's invalid to have more than one entry with the same 1390 * local APIC ID in SRAT table. 
1391 */ 1392 if (cpu_node[i].node != UINT_MAX) 1393 return (-2); 1394 1395 /* 1396 * Fill in node and proximity domain IDs 1397 */ 1398 cpu_node[i].prox_domain = domain; 1399 cpu_node[i].node = node; 1400 1401 return (0); 1402 } 1403 1404 /* 1405 * It's possible that an apicid doesn't exist in the cpu_node map due 1406 * to user limits number of CPUs powered on at boot by specifying the 1407 * boot_ncpus kernel option. 1408 */ 1409 return (2); 1410 } 1411 1412 1413 /* 1414 * Get node ID for given CPU 1415 */ 1416 static int 1417 lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, 1418 int cpu_node_nentries) 1419 { 1420 processorid_t cpuid; 1421 1422 if (cp == NULL) 1423 return (-1); 1424 1425 cpuid = cp->cpu_id; 1426 if (cpuid < 0 || cpuid >= max_ncpus) 1427 return (-1); 1428 1429 /* 1430 * SRAT doesn't exist, isn't enabled, or there was an error processing 1431 * it, so return node ID for Opteron and -1 otherwise. 1432 */ 1433 if (srat_ptr == NULL || !lgrp_plat_srat_enable || 1434 lgrp_plat_srat_error) { 1435 if (is_opteron()) 1436 return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE)); 1437 return (-1); 1438 } 1439 1440 /* 1441 * Return -1 when CPU to node ID mapping entry doesn't exist for given 1442 * CPU 1443 */ 1444 if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists) 1445 return (-1); 1446 1447 return (cpu_node[cpuid].node); 1448 } 1449 1450 1451 /* 1452 * Return node number for given proximity domain/system locality 1453 */ 1454 static int 1455 lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt, 1456 uint32_t domain) 1457 { 1458 uint_t node; 1459 uint_t start; 1460 1461 /* 1462 * Hash proximity domain ID into node to domain mapping table (array), 1463 * search for entry with matching proximity domain ID, and return index 1464 * of matching entry as node ID. 1465 */ 1466 node = start = NODE_DOMAIN_HASH(domain, node_cnt); 1467 do { 1468 if (node_domain[node].exists) { 1469 membar_consumer(); 1470 if (node_domain[node].prox_domain == domain) 1471 return (node); 1472 } 1473 node = (node + 1) % node_cnt; 1474 } while (node != start); 1475 return (-1); 1476 } 1477 1478 1479 /* 1480 * Get NUMA configuration of machine 1481 */ 1482 static void 1483 lgrp_plat_get_numa_config(void) 1484 { 1485 uint_t probe_op; 1486 1487 /* 1488 * Read boot property with CPU to APIC ID mapping table/array to 1489 * determine number of CPUs 1490 */ 1491 lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL); 1492 1493 /* 1494 * Determine which CPUs and memory are local to each other and number 1495 * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) 1496 */ 1497 if (lgrp_plat_apic_ncpus > 0) { 1498 int retval; 1499 1500 /* Reserve enough resources if CPU DR is enabled. 
*/ 1501 if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus) 1502 lgrp_plat_cpu_node_nentries = max_ncpus; 1503 else 1504 lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; 1505 1506 /* 1507 * Temporarily allocate boot memory to use for CPU to node 1508 * mapping since kernel memory allocator isn't alive yet 1509 */ 1510 lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops, 1511 NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t), 1512 sizeof (int)); 1513 1514 ASSERT(lgrp_plat_cpu_node != NULL); 1515 if (lgrp_plat_cpu_node) { 1516 bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries * 1517 sizeof (cpu_node_map_t)); 1518 } else { 1519 lgrp_plat_cpu_node_nentries = 0; 1520 } 1521 1522 /* 1523 * Fill in CPU to node ID mapping table with APIC ID for each 1524 * CPU 1525 */ 1526 (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); 1527 1528 retval = lgrp_plat_process_srat(srat_ptr, msct_ptr, 1529 &lgrp_plat_prox_domain_min, 1530 lgrp_plat_node_domain, lgrp_plat_cpu_node, 1531 lgrp_plat_apic_ncpus, lgrp_plat_memnode_info); 1532 if (retval <= 0) { 1533 lgrp_plat_srat_error = retval; 1534 lgrp_plat_node_cnt = 1; 1535 } else { 1536 lgrp_plat_srat_error = 0; 1537 lgrp_plat_node_cnt = retval; 1538 } 1539 } 1540 1541 /* 1542 * Try to use PCI config space registers on Opteron if there's an error 1543 * processing CPU to APIC ID mapping or SRAT 1544 */ 1545 if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && 1546 is_opteron()) 1547 opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, 1548 lgrp_plat_memnode_info); 1549 1550 /* 1551 * Don't bother to setup system for multiple lgroups and only use one 1552 * memory node when memory is interleaved between any nodes or there is 1553 * only one NUMA node 1554 */ 1555 if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { 1556 lgrp_plat_node_cnt = max_mem_nodes = 1; 1557 (void) lgrp_topo_ht_limit_set(1); 1558 return; 1559 } 1560 1561 /* 1562 * Leaf lgroups on x86/x64 architectures contain one physical 1563 * processor chip. Tune lgrp_expand_proc_thresh and 1564 * lgrp_expand_proc_diff so that lgrp_choose() will spread 1565 * things out aggressively. 1566 */ 1567 lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 1568 lgrp_expand_proc_diff = 0; 1569 1570 /* 1571 * There should be one memnode (physical page free list(s)) for 1572 * each node if memory DR is disabled. 1573 */ 1574 max_mem_nodes = lgrp_plat_node_cnt; 1575 1576 /* 1577 * Initialize min and max latency before reading SLIT or probing 1578 */ 1579 lgrp_plat_lat_stats.latency_min = -1; 1580 lgrp_plat_lat_stats.latency_max = 0; 1581 1582 /* 1583 * Determine how far each NUMA node is from each other by 1584 * reading ACPI System Locality Information Table (SLIT) if it 1585 * exists 1586 */ 1587 lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, 1588 lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info, 1589 &lgrp_plat_lat_stats); 1590 1591 /* 1592 * Disable support of CPU/memory DR operations if multiple locality 1593 * domains exist in system and either of following is true. 1594 * 1) Failed to process SLIT table. 1595 * 2) Latency probing is enabled by user. 
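	 * (Run-time latency probing assumes "node ID == memnode ID", which no
	 * longer holds once memory has been hot-added, so probing and DR
	 * support cannot be used together; see the policies at the top of
	 * this file.)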
1596 */ 1597 if (lgrp_plat_node_cnt > 1 && 1598 (plat_dr_support_cpu() || plat_dr_support_memory())) { 1599 if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 || 1600 !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 || 1601 lgrp_plat_apic_ncpus <= 0) { 1602 cmn_err(CE_CONT, 1603 "?lgrp: failed to process ACPI SRAT/SLIT table, " 1604 "disable support of CPU/memory DR operations."); 1605 plat_dr_disable_cpu(); 1606 plat_dr_disable_memory(); 1607 } else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) { 1608 cmn_err(CE_CONT, 1609 "?lgrp: latency probing enabled by user, " 1610 "disable support of CPU/memory DR operations."); 1611 plat_dr_disable_cpu(); 1612 plat_dr_disable_memory(); 1613 } 1614 } 1615 1616 /* Done if succeeded to process SLIT table. */ 1617 if (lgrp_plat_slit_error == 0) 1618 return; 1619 1620 /* 1621 * Probe to determine latency between NUMA nodes when SLIT 1622 * doesn't exist or make sense 1623 */ 1624 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; 1625 1626 /* 1627 * Specify whether to probe using vendor ID register or page copy 1628 * if hasn't been specified already or is overspecified 1629 */ 1630 probe_op = lgrp_plat_probe_flags & 1631 (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1632 1633 if (probe_op == 0 || 1634 probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { 1635 lgrp_plat_probe_flags &= 1636 ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1637 if (is_opteron()) 1638 lgrp_plat_probe_flags |= 1639 LGRP_PLAT_PROBE_VENDOR; 1640 else 1641 lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; 1642 } 1643 1644 /* 1645 * Probing errors can mess up the lgroup topology and 1646 * force us fall back to a 2 level lgroup topology. 1647 * Here we bound how tall the lgroup topology can grow 1648 * in hopes of avoiding any anamolies in probing from 1649 * messing up the lgroup topology by limiting the 1650 * accuracy of the latency topology. 1651 * 1652 * Assume that nodes will at least be configured in a 1653 * ring, so limit height of lgroup topology to be less 1654 * than number of nodes on a system with 4 or more 1655 * nodes 1656 */ 1657 if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 1658 lgrp_topo_ht_limit_default()) 1659 (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 1660 } 1661 1662 1663 /* 1664 * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 1665 * be considered same 1666 */ 1667 #define LGRP_LAT_TOLERANCE_SHIFT 4 1668 1669 int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 1670 1671 1672 /* 1673 * Adjust latencies between nodes to be symmetric, normalize latencies between 1674 * any nodes that are within some tolerance to be same, and make local 1675 * latencies be same 1676 */ 1677 static void 1678 lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, 1679 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 1680 { 1681 int i; 1682 int j; 1683 int k; 1684 int l; 1685 u_longlong_t max; 1686 u_longlong_t min; 1687 u_longlong_t t; 1688 u_longlong_t t1; 1689 u_longlong_t t2; 1690 const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 1691 int lat_corrected[MAX_NODES][MAX_NODES]; 1692 1693 /* 1694 * Nothing to do when this is an UMA machine or don't have args needed 1695 */ 1696 if (max_mem_nodes == 1) 1697 return; 1698 1699 ASSERT(memnode_info != NULL && lat_stats != NULL && 1700 probe_stats != NULL); 1701 1702 /* 1703 * Make sure that latencies are symmetric between any two nodes 1704 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1705 */ 1706 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1707 if (!memnode_info[i].exists) 1708 continue; 1709 1710 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1711 if (!memnode_info[j].exists) 1712 continue; 1713 1714 t1 = lat_stats->latencies[i][j]; 1715 t2 = lat_stats->latencies[j][i]; 1716 1717 if (t1 == 0 || t2 == 0 || t1 == t2) 1718 continue; 1719 1720 /* 1721 * Latencies should be same 1722 * - Use minimum of two latencies which should be same 1723 * - Track suspect probe times not within tolerance of 1724 * min value 1725 * - Remember how much values are corrected by 1726 */ 1727 if (t1 > t2) { 1728 t = t2; 1729 probe_stats->probe_errors[i][j] += t1 - t2; 1730 if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 1731 probe_stats->probe_suspect[i][j]++; 1732 probe_stats->probe_suspect[j][i]++; 1733 } 1734 } else if (t2 > t1) { 1735 t = t1; 1736 probe_stats->probe_errors[j][i] += t2 - t1; 1737 if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 1738 probe_stats->probe_suspect[i][j]++; 1739 probe_stats->probe_suspect[j][i]++; 1740 } 1741 } 1742 1743 lat_stats->latencies[i][j] = 1744 lat_stats->latencies[j][i] = t; 1745 lgrp_config(cflag, t1, t); 1746 lgrp_config(cflag, t2, t); 1747 } 1748 } 1749 1750 /* 1751 * Keep track of which latencies get corrected 1752 */ 1753 for (i = 0; i < MAX_NODES; i++) 1754 for (j = 0; j < MAX_NODES; j++) 1755 lat_corrected[i][j] = 0; 1756 1757 /* 1758 * For every two nodes, see whether there is another pair of nodes which 1759 * are about the same distance apart and make the latencies be the same 1760 * if they are close enough together 1761 */ 1762 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1763 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1764 if (!memnode_info[j].exists) 1765 continue; 1766 /* 1767 * Pick one pair of nodes (i, j) 1768 * and get latency between them 1769 */ 1770 t1 = lat_stats->latencies[i][j]; 1771 1772 /* 1773 * Skip this pair of nodes if there isn't a latency 1774 * for it yet 1775 */ 1776 if (t1 == 0) 1777 continue; 1778 1779 for (k = 0; k < lgrp_plat_node_cnt; k++) { 1780 for (l = 0; l < lgrp_plat_node_cnt; l++) { 1781 if (!memnode_info[l].exists) 1782 continue; 1783 /* 1784 * Pick another pair of nodes (k, l) 1785 * not same as (i, j) and get latency 1786 * between them 1787 */ 1788 if (k == i && l == j) 1789 continue; 1790 1791 t2 = lat_stats->latencies[k][l]; 1792 1793 /* 1794 * Skip this pair of nodes if there 1795 * isn't a latency for it yet 1796 */ 1797 1798 if (t2 == 0) 1799 continue; 1800 1801 /* 1802 * Skip nodes (k, l) if they already 1803 * have same latency as (i, j) or 1804 * their latency isn't close enough to 1805 * be considered/made the same 1806 */ 1807 if (t1 == t2 || (t1 > t2 && t1 - t2 > 1808 t1 >> lgrp_plat_probe_lt_shift) || 1809 (t2 > t1 && t2 - t1 > 1810 t2 >> lgrp_plat_probe_lt_shift)) 1811 continue; 1812 1813 /* 1814 * Make latency(i, j) same as 1815 * latency(k, l), try to use latency 1816 * that has been adjusted already to get 1817 * more consistency (if possible), and 1818 * remember which latencies were 1819 * adjusted for next time 1820 */ 1821 if (lat_corrected[i][j]) { 1822 t = t1; 1823 lgrp_config(cflag, t2, t); 1824 t2 = t; 1825 } else if (lat_corrected[k][l]) { 1826 t = t2; 1827 lgrp_config(cflag, t1, t); 1828 t1 = t; 1829 } else { 1830 if (t1 > t2) 1831 t = t2; 1832 else 1833 t = t1; 1834 lgrp_config(cflag, t1, t); 1835 lgrp_config(cflag, t2, t); 1836 t1 = t2 = t; 1837 } 1838 1839 lat_stats->latencies[i][j] = 1840 lat_stats->latencies[k][l] = t; 1841 1842 
lat_corrected[i][j] = 1843 lat_corrected[k][l] = 1; 1844 } 1845 } 1846 } 1847 } 1848 1849 /* 1850 * Local latencies should be same 1851 * - Find min and max local latencies 1852 * - Make all local latencies be minimum 1853 */ 1854 min = -1; 1855 max = 0; 1856 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1857 if (!memnode_info[i].exists) 1858 continue; 1859 t = lat_stats->latencies[i][i]; 1860 if (t == 0) 1861 continue; 1862 if (min == -1 || t < min) 1863 min = t; 1864 if (t > max) 1865 max = t; 1866 } 1867 if (min != max) { 1868 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1869 int local; 1870 1871 if (!memnode_info[i].exists) 1872 continue; 1873 1874 local = lat_stats->latencies[i][i]; 1875 if (local == 0) 1876 continue; 1877 1878 /* 1879 * Track suspect probe times that aren't within 1880 * tolerance of minimum local latency and how much 1881 * probe times are corrected by 1882 */ 1883 if (local - min > min >> lgrp_plat_probe_lt_shift) 1884 probe_stats->probe_suspect[i][i]++; 1885 1886 probe_stats->probe_errors[i][i] += local - min; 1887 1888 /* 1889 * Make local latencies be minimum 1890 */ 1891 lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 1892 lat_stats->latencies[i][i] = min; 1893 } 1894 } 1895 1896 /* 1897 * Determine max probe time again since just adjusted latencies 1898 */ 1899 lat_stats->latency_max = 0; 1900 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1901 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1902 if (!memnode_info[j].exists) 1903 continue; 1904 t = lat_stats->latencies[i][j]; 1905 if (t > lat_stats->latency_max) 1906 lat_stats->latency_max = t; 1907 } 1908 } 1909 } 1910 1911 1912 /* 1913 * Verify following about latencies between nodes: 1914 * 1915 * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 1916 * - Local latencies same 1917 * - Local < remote 1918 * - Number of latencies seen is reasonable 1919 * - Number of occurrences of a given latency should be more than 1 1920 * 1921 * Returns: 1922 * 0 Success 1923 * -1 Not symmetric 1924 * -2 Local latencies not same 1925 * -3 Local >= remote 1926 */ 1927 static int 1928 lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, 1929 lgrp_plat_latency_stats_t *lat_stats) 1930 { 1931 int i; 1932 int j; 1933 u_longlong_t t1; 1934 u_longlong_t t2; 1935 1936 ASSERT(memnode_info != NULL && lat_stats != NULL); 1937 1938 /* 1939 * Nothing to do when this is an UMA machine, lgroup topology is 1940 * limited to 2 levels, or there aren't any probe times yet 1941 */ 1942 if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 1943 lat_stats->latencies[0][0] == 0) 1944 return (0); 1945 1946 /* 1947 * Make sure that latencies are symmetric between any two nodes 1948 * (ie. 
latency(node0, node1) == latency(node1, node0)) 1949 */ 1950 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1951 if (!memnode_info[i].exists) 1952 continue; 1953 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1954 if (!memnode_info[j].exists) 1955 continue; 1956 t1 = lat_stats->latencies[i][j]; 1957 t2 = lat_stats->latencies[j][i]; 1958 1959 if (t1 == 0 || t2 == 0 || t1 == t2) 1960 continue; 1961 1962 return (-1); 1963 } 1964 } 1965 1966 /* 1967 * Local latencies should be same 1968 */ 1969 t1 = lat_stats->latencies[0][0]; 1970 for (i = 1; i < lgrp_plat_node_cnt; i++) { 1971 if (!memnode_info[i].exists) 1972 continue; 1973 1974 t2 = lat_stats->latencies[i][i]; 1975 if (t2 == 0) 1976 continue; 1977 1978 if (t1 == 0) { 1979 t1 = t2; 1980 continue; 1981 } 1982 1983 if (t1 != t2) 1984 return (-2); 1985 } 1986 1987 /* 1988 * Local latencies should be less than remote 1989 */ 1990 if (t1) { 1991 for (i = 0; i < lgrp_plat_node_cnt; i++) { 1992 for (j = 0; j < lgrp_plat_node_cnt; j++) { 1993 if (!memnode_info[j].exists) 1994 continue; 1995 t2 = lat_stats->latencies[i][j]; 1996 if (i == j || t2 == 0) 1997 continue; 1998 1999 if (t1 >= t2) 2000 return (-3); 2001 } 2002 } 2003 } 2004 2005 return (0); 2006 } 2007 2008 2009 /* 2010 * Platform-specific initialization 2011 */ 2012 static void 2013 lgrp_plat_main_init(void) 2014 { 2015 int curnode; 2016 int ht_limit; 2017 int i; 2018 2019 /* 2020 * Print a notice that MPO is disabled when memory is interleaved 2021 * across nodes....Would do this when it is discovered, but can't 2022 * because it happens way too early during boot.... 2023 */ 2024 if (lgrp_plat_mem_intrlv) 2025 cmn_err(CE_NOTE, 2026 "MPO disabled because memory is interleaved\n"); 2027 2028 /* 2029 * Don't bother to do any probing if it is disabled, there is only one 2030 * node, or the height of the lgroup topology less than or equal to 2 2031 */ 2032 ht_limit = lgrp_topo_ht_limit(); 2033 if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 2034 max_mem_nodes == 1 || ht_limit <= 2) { 2035 /* 2036 * Setup lgroup latencies for 2 level lgroup topology 2037 * (ie. local and remote only) if they haven't been set yet 2038 */ 2039 if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 2040 lgrp_plat_lat_stats.latency_max == 0) 2041 lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 2042 return; 2043 } 2044 2045 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 2046 /* 2047 * Should have been able to probe from CPU 0 when it was added 2048 * to lgroup hierarchy, but may not have been able to then 2049 * because it happens so early in boot that gethrtime() hasn't 2050 * been initialized. 
(:-( 2051 */ 2052 curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 2053 lgrp_plat_cpu_node_nentries); 2054 ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); 2055 if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) 2056 lgrp_plat_probe(); 2057 2058 return; 2059 } 2060 2061 /* 2062 * When probing memory, use one page for every sample to determine 2063 * lgroup topology and taking multiple samples 2064 */ 2065 if (lgrp_plat_probe_mem_config.probe_memsize == 0) 2066 lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * 2067 lgrp_plat_probe_nsamples; 2068 2069 /* 2070 * Map memory in each node needed for probing to determine latency 2071 * topology 2072 */ 2073 for (i = 0; i < lgrp_plat_node_cnt; i++) { 2074 int mnode; 2075 2076 /* 2077 * Skip this node and leave its probe page NULL 2078 * if it doesn't have any memory 2079 */ 2080 mnode = i; 2081 if (!mem_node_config[mnode].exists) { 2082 lgrp_plat_probe_mem_config.probe_va[i] = NULL; 2083 continue; 2084 } 2085 2086 /* 2087 * Allocate one kernel virtual page 2088 */ 2089 lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, 2090 lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); 2091 if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { 2092 cmn_err(CE_WARN, 2093 "lgrp_plat_main_init: couldn't allocate memory"); 2094 return; 2095 } 2096 2097 /* 2098 * Get PFN for first page in each node 2099 */ 2100 lgrp_plat_probe_mem_config.probe_pfn[i] = 2101 mem_node_config[mnode].physbase; 2102 2103 /* 2104 * Map virtual page to first page in node 2105 */ 2106 hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], 2107 lgrp_plat_probe_mem_config.probe_memsize, 2108 lgrp_plat_probe_mem_config.probe_pfn[i], 2109 PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 2110 HAT_LOAD_NOCONSIST); 2111 } 2112 2113 /* 2114 * Probe from current CPU 2115 */ 2116 lgrp_plat_probe(); 2117 } 2118 2119 2120 /* 2121 * Return the number of free, allocatable, or installed 2122 * pages in an lgroup 2123 * This is a copy of the MAX_MEM_NODES == 1 version of the routine 2124 * used when MPO is disabled (i.e. 
single lgroup) or this is the root lgroup 2125 */ 2126 static pgcnt_t 2127 lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 2128 { 2129 _NOTE(ARGUNUSED(lgrphand)); 2130 2131 struct memlist *mlist; 2132 pgcnt_t npgs = 0; 2133 extern struct memlist *phys_avail; 2134 extern struct memlist *phys_install; 2135 2136 switch (query) { 2137 case LGRP_MEM_SIZE_FREE: 2138 return ((pgcnt_t)freemem); 2139 case LGRP_MEM_SIZE_AVAIL: 2140 memlist_read_lock(); 2141 for (mlist = phys_avail; mlist; mlist = mlist->ml_next) 2142 npgs += btop(mlist->ml_size); 2143 memlist_read_unlock(); 2144 return (npgs); 2145 case LGRP_MEM_SIZE_INSTALL: 2146 memlist_read_lock(); 2147 for (mlist = phys_install; mlist; mlist = mlist->ml_next) 2148 npgs += btop(mlist->ml_size); 2149 memlist_read_unlock(); 2150 return (npgs); 2151 default: 2152 return ((pgcnt_t)0); 2153 } 2154 } 2155 2156 2157 /* 2158 * Update node to proximity domain mappings for given domain and return node ID 2159 */ 2160 static int 2161 lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, 2162 uint32_t domain) 2163 { 2164 uint_t node; 2165 uint_t start; 2166 2167 /* 2168 * Hash proximity domain ID into node to domain mapping table (array) 2169 * and add entry for it into first non-existent or matching entry found 2170 */ 2171 node = start = NODE_DOMAIN_HASH(domain, node_cnt); 2172 do { 2173 /* 2174 * Entry doesn't exist yet, so create one for this proximity 2175 * domain and return node ID which is index into mapping table. 2176 */ 2177 if (!node_domain[node].exists) { 2178 node_domain[node].prox_domain = domain; 2179 membar_producer(); 2180 node_domain[node].exists = 1; 2181 return (node); 2182 } 2183 2184 /* 2185 * Entry exists for this proximity domain already, so just 2186 * return node ID (index into table). 2187 */ 2188 if (node_domain[node].prox_domain == domain) 2189 return (node); 2190 node = NODE_DOMAIN_HASH(node + 1, node_cnt); 2191 } while (node != start); 2192 2193 /* 2194 * Ran out of supported number of entries which shouldn't happen.... 2195 */ 2196 ASSERT(node != start); 2197 return (-1); 2198 } 2199 2200 /* 2201 * Update node memory information for given proximity domain with specified 2202 * starting and ending physical address range (and return positive numbers for 2203 * success and negative ones for errors) 2204 */ 2205 static int 2206 lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt, 2207 memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start, 2208 uint64_t end, uint32_t domain, uint32_t device_id) 2209 { 2210 int node, mnode; 2211 2212 /* 2213 * Get node number for proximity domain 2214 */ 2215 node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 2216 if (node == -1) { 2217 node = lgrp_plat_node_domain_update(node_domain, node_cnt, 2218 domain); 2219 if (node == -1) 2220 return (-1); 2221 } 2222 2223 /* 2224 * This function is called during boot if device_id is 2225 * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for 2226 * memory DR operations. 
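 *
 * As a hedged illustration only (the domain and address values here are
 * made up, not taken from any real table), a boot-time SRAT memory entry
 * for proximity domain 0 covering the first 4GB would reach this code
 * roughly as
 *
 *	(void) lgrp_plat_memnode_info_update(node_domain, node_cnt,
 *	    memnode_info, node_cnt, 0x0ULL, 0xFFFFFFFFULL, 0,
 *	    ACPI_MEMNODE_DEVID_BOOT);
 *
 * and takes the boot path below, while a runtime hot-add for the same
 * domain passes what is presumably the ACPI device ID of the added memory
 * device and takes the DR path, which allocates a memory node at or above
 * lgrp_plat_node_cnt instead of extending the boot-time entry.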
2227 */ 2228 if (device_id != ACPI_MEMNODE_DEVID_BOOT) { 2229 ASSERT(lgrp_plat_max_mem_node <= memnode_cnt); 2230 2231 for (mnode = lgrp_plat_node_cnt; 2232 mnode < lgrp_plat_max_mem_node; mnode++) { 2233 if (memnode_info[mnode].exists && 2234 memnode_info[mnode].prox_domain == domain && 2235 memnode_info[mnode].device_id == device_id) { 2236 if (btop(start) < memnode_info[mnode].start) 2237 memnode_info[mnode].start = btop(start); 2238 if (btop(end) > memnode_info[mnode].end) 2239 memnode_info[mnode].end = btop(end); 2240 return (1); 2241 } 2242 } 2243 2244 if (lgrp_plat_max_mem_node >= memnode_cnt) { 2245 return (-3); 2246 } else { 2247 lgrp_plat_max_mem_node++; 2248 memnode_info[mnode].start = btop(start); 2249 memnode_info[mnode].end = btop(end); 2250 memnode_info[mnode].prox_domain = domain; 2251 memnode_info[mnode].device_id = device_id; 2252 memnode_info[mnode].lgrphand = node; 2253 membar_producer(); 2254 memnode_info[mnode].exists = 1; 2255 return (0); 2256 } 2257 } 2258 2259 /* 2260 * Create entry in table for node if it doesn't exist 2261 */ 2262 ASSERT(node < memnode_cnt); 2263 if (!memnode_info[node].exists) { 2264 memnode_info[node].start = btop(start); 2265 memnode_info[node].end = btop(end); 2266 memnode_info[node].prox_domain = domain; 2267 memnode_info[node].device_id = device_id; 2268 memnode_info[node].lgrphand = node; 2269 membar_producer(); 2270 memnode_info[node].exists = 1; 2271 return (0); 2272 } 2273 2274 /* 2275 * Entry already exists for this proximity domain 2276 * 2277 * There may be more than one SRAT memory entry for a domain, so we may 2278 * need to update existing start or end address for the node. 2279 */ 2280 if (memnode_info[node].prox_domain == domain) { 2281 if (btop(start) < memnode_info[node].start) 2282 memnode_info[node].start = btop(start); 2283 if (btop(end) > memnode_info[node].end) 2284 memnode_info[node].end = btop(end); 2285 return (1); 2286 } 2287 return (-2); 2288 } 2289 2290 2291 /* 2292 * Have to sort nodes by starting physical address because plat_mnode_xcheck() 2293 * assumes and expects memnodes to be sorted in ascending order by physical 2294 * address. 2295 */ 2296 static void 2297 lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, 2298 cpu_node_map_t *cpu_node, int cpu_count, 2299 memnode_phys_addr_map_t *memnode_info) 2300 { 2301 boolean_t found; 2302 int i; 2303 int j; 2304 int n; 2305 boolean_t sorted; 2306 boolean_t swapped; 2307 2308 if (!lgrp_plat_node_sort_enable || node_cnt <= 1 || 2309 node_domain == NULL || memnode_info == NULL) 2310 return; 2311 2312 /* 2313 * Sorted already? 
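 *
 * For example (with hypothetical addresses), if the SRAT enumerated
 * proximity domains such that memnode_info[1].start ended up below
 * memnode_info[0].start, the check below notices the out-of-order pair and
 * the bubble sort further down swaps both the memnode_info[] entries and
 * the corresponding node_domain[] entries, so memory nodes end up in
 * ascending physical address order as plat_mnode_xcheck() expects.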
2314 */ 2315 sorted = B_TRUE; 2316 for (i = 0; i < node_cnt - 1; i++) { 2317 /* 2318 * Skip entries that don't exist 2319 */ 2320 if (!memnode_info[i].exists) 2321 continue; 2322 2323 /* 2324 * Try to find next existing entry to compare against 2325 */ 2326 found = B_FALSE; 2327 for (j = i + 1; j < node_cnt; j++) { 2328 if (memnode_info[j].exists) { 2329 found = B_TRUE; 2330 break; 2331 } 2332 } 2333 2334 /* 2335 * Done if no more existing entries to compare against 2336 */ 2337 if (found == B_FALSE) 2338 break; 2339 2340 /* 2341 * Not sorted if starting address of current entry is bigger 2342 * than starting address of next existing entry 2343 */ 2344 if (memnode_info[i].start > memnode_info[j].start) { 2345 sorted = B_FALSE; 2346 break; 2347 } 2348 } 2349 2350 /* 2351 * Don't need to sort if sorted already 2352 */ 2353 if (sorted == B_TRUE) 2354 return; 2355 2356 /* 2357 * Just use bubble sort since number of nodes is small 2358 */ 2359 n = node_cnt; 2360 do { 2361 swapped = B_FALSE; 2362 n--; 2363 for (i = 0; i < n; i++) { 2364 /* 2365 * Skip entries that don't exist 2366 */ 2367 if (!memnode_info[i].exists) 2368 continue; 2369 2370 /* 2371 * Try to find next existing entry to compare against 2372 */ 2373 found = B_FALSE; 2374 for (j = i + 1; j <= n; j++) { 2375 if (memnode_info[j].exists) { 2376 found = B_TRUE; 2377 break; 2378 } 2379 } 2380 2381 /* 2382 * Done if no more existing entries to compare against 2383 */ 2384 if (found == B_FALSE) 2385 break; 2386 2387 if (memnode_info[i].start > memnode_info[j].start) { 2388 memnode_phys_addr_map_t save_addr; 2389 node_domain_map_t save_node; 2390 2391 /* 2392 * Swap node to proximity domain ID assignments 2393 */ 2394 bcopy(&node_domain[i], &save_node, 2395 sizeof (node_domain_map_t)); 2396 bcopy(&node_domain[j], &node_domain[i], 2397 sizeof (node_domain_map_t)); 2398 bcopy(&save_node, &node_domain[j], 2399 sizeof (node_domain_map_t)); 2400 2401 /* 2402 * Swap node to physical memory assignments 2403 */ 2404 bcopy(&memnode_info[i], &save_addr, 2405 sizeof (memnode_phys_addr_map_t)); 2406 bcopy(&memnode_info[j], &memnode_info[i], 2407 sizeof (memnode_phys_addr_map_t)); 2408 bcopy(&save_addr, &memnode_info[j], 2409 sizeof (memnode_phys_addr_map_t)); 2410 swapped = B_TRUE; 2411 } 2412 } 2413 } while (swapped == B_TRUE); 2414 2415 /* 2416 * Check to make sure that CPUs are assigned to correct node IDs now since 2417 * node to proximity domain ID assignments may have been changed above 2418 */ 2419 if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1) 2420 return; 2421 for (i = 0; i < cpu_count; i++) { 2422 int node; 2423 2424 node = lgrp_plat_domain_to_node(node_domain, node_cnt, 2425 cpu_node[i].prox_domain); 2426 if (cpu_node[i].node != node) 2427 cpu_node[i].node = node; 2428 } 2429 2430 } 2431 2432 2433 /* 2434 * Return time needed to probe from current CPU to memory in given node 2435 */ 2436 static hrtime_t 2437 lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries, 2438 lgrp_plat_probe_mem_config_t *probe_mem_config, 2439 lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 2440 { 2441 caddr_t buf; 2442 hrtime_t elapsed; 2443 hrtime_t end; 2444 int from; 2445 int i; 2446 int ipl; 2447 hrtime_t max; 2448 hrtime_t min; 2449 hrtime_t start; 2450 extern int use_sse_pagecopy; 2451 2452 /* 2453 * Determine ID of node containing current CPU 2454 */ 2455 from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries); 2456 ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 2457 2458 /* 2459 * Do common
work for probing main memory 2460 */ 2461 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 2462 /* 2463 * Skip probing any nodes without memory and 2464 * set probe time to 0 2465 */ 2466 if (probe_mem_config->probe_va[to] == NULL) { 2467 lat_stats->latencies[from][to] = 0; 2468 return (0); 2469 } 2470 2471 /* 2472 * Invalidate caches once instead of once every sample 2473 * which should cut cost of probing by a lot 2474 */ 2475 probe_stats->flush_cost = gethrtime(); 2476 invalidate_cache(); 2477 probe_stats->flush_cost = gethrtime() - 2478 probe_stats->flush_cost; 2479 probe_stats->probe_cost_total += probe_stats->flush_cost; 2480 } 2481 2482 /* 2483 * Probe from current CPU to given memory using specified operation 2484 * and take specified number of samples 2485 */ 2486 max = 0; 2487 min = -1; 2488 for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 2489 probe_stats->probe_cost = gethrtime(); 2490 2491 /* 2492 * Can't measure probe time if gethrtime() isn't working yet 2493 */ 2494 if (probe_stats->probe_cost == 0 && gethrtime() == 0) 2495 return (0); 2496 2497 if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 2498 /* 2499 * Measure how long it takes to read vendor ID from 2500 * Northbridge 2501 */ 2502 elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads); 2503 } else { 2504 /* 2505 * Measure how long it takes to copy page 2506 * on top of itself 2507 */ 2508 buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 2509 2510 kpreempt_disable(); 2511 ipl = splhigh(); 2512 start = gethrtime(); 2513 if (use_sse_pagecopy) 2514 hwblkpagecopy(buf, buf); 2515 else 2516 bcopy(buf, buf, PAGESIZE); 2517 end = gethrtime(); 2518 elapsed = end - start; 2519 splx(ipl); 2520 kpreempt_enable(); 2521 } 2522 2523 probe_stats->probe_cost = gethrtime() - 2524 probe_stats->probe_cost; 2525 probe_stats->probe_cost_total += probe_stats->probe_cost; 2526 2527 if (min == -1 || elapsed < min) 2528 min = elapsed; 2529 if (elapsed > max) 2530 max = elapsed; 2531 } 2532 2533 /* 2534 * Update minimum and maximum probe times between 2535 * these two nodes 2536 */ 2537 if (min < probe_stats->probe_min[from][to] || 2538 probe_stats->probe_min[from][to] == 0) 2539 probe_stats->probe_min[from][to] = min; 2540 2541 if (max > probe_stats->probe_max[from][to]) 2542 probe_stats->probe_max[from][to] = max; 2543 2544 return (min); 2545 } 2546 2547 2548 /* 2549 * Read boot property with CPU to APIC ID array, fill in CPU to node ID 2550 * mapping table with APIC ID for each CPU (if pointer to table isn't NULL), 2551 * and return number of CPU APIC IDs. 2552 * 2553 * NOTE: This code assumes that CPU IDs are assigned in the order that they appear 2554 * in the cpu_apicid_array boot property which is based on and follows 2555 * the same ordering as the processor list in ACPI MADT. If the code in 2556 * usr/src/uts/i86pc/io/pcplusmp/apic.c that reads MADT and assigns 2557 * CPU IDs ever changes, then this code will need to change too.... 2558 */ 2559 static int 2560 lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node) 2561 { 2562 int boot_prop_len; 2563 char *boot_prop_name = BP_CPU_APICID_ARRAY; 2564 uint8_t cpu_apicid_array[UINT8_MAX + 1]; 2565 int i; 2566 int n; 2567 2568 /* 2569 * Check length of property value 2570 */ 2571 boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name); 2572 if (boot_prop_len <= 0 || boot_prop_len > sizeof (cpu_apicid_array)) 2573 return (-1); 2574 2575 /* 2576 * Calculate number of entries in array and return when the system is 2577 * not very interesting for NUMA.
It's not interesting for NUMA if 2578 * system has only one CPU and doesn't support CPU hotplug. 2579 */ 2580 n = boot_prop_len / sizeof (uint8_t); 2581 if (n == 1 && !plat_dr_support_cpu()) 2582 return (-2); 2583 2584 /* 2585 * Get CPU to APIC ID property value 2586 */ 2587 if (BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0) 2588 return (-3); 2589 2590 /* 2591 * Just return number of CPU APIC IDs if CPU to node mapping table is 2592 * NULL 2593 */ 2594 if (cpu_node == NULL) { 2595 if (plat_dr_support_cpu() && n >= boot_ncpus) { 2596 return (boot_ncpus); 2597 } else { 2598 return (n); 2599 } 2600 } 2601 2602 /* 2603 * Fill in CPU to node ID mapping table with APIC ID for each CPU 2604 */ 2605 for (i = 0; i < n; i++) { 2606 /* Only add boot CPUs into the map if CPU DR is enabled. */ 2607 if (plat_dr_support_cpu() && i >= boot_ncpus) 2608 break; 2609 cpu_node[i].exists = 1; 2610 cpu_node[i].apicid = cpu_apicid_array[i]; 2611 cpu_node[i].prox_domain = UINT32_MAX; 2612 cpu_node[i].node = UINT_MAX; 2613 } 2614 2615 /* 2616 * Return number of CPUs based on number of APIC IDs 2617 */ 2618 return (i); 2619 } 2620 2621 2622 /* 2623 * Read ACPI System Locality Information Table (SLIT) to determine how far each 2624 * NUMA node is from each other 2625 */ 2626 static int 2627 lgrp_plat_process_slit(struct slit *tp, 2628 node_domain_map_t *node_domain, uint_t node_cnt, 2629 memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats) 2630 { 2631 int i; 2632 int j; 2633 int src; 2634 int dst; 2635 int localities; 2636 hrtime_t max; 2637 hrtime_t min; 2638 int retval; 2639 uint8_t *slit_entries; 2640 2641 if (tp == NULL || !lgrp_plat_slit_enable) 2642 return (1); 2643 2644 if (lat_stats == NULL) 2645 return (2); 2646 2647 localities = tp->number; 2648 2649 min = lat_stats->latency_min; 2650 max = lat_stats->latency_max; 2651 2652 /* 2653 * Fill in latency matrix based on SLIT entries 2654 */ 2655 slit_entries = tp->entry; 2656 for (i = 0; i < localities; i++) { 2657 src = lgrp_plat_domain_to_node(node_domain, 2658 node_cnt, i); 2659 if (src == -1) 2660 continue; 2661 2662 for (j = 0; j < localities; j++) { 2663 uint8_t latency; 2664 2665 dst = lgrp_plat_domain_to_node(node_domain, 2666 node_cnt, j); 2667 if (dst == -1) 2668 continue; 2669 2670 latency = slit_entries[(i * localities) + j]; 2671 lat_stats->latencies[src][dst] = latency; 2672 if (latency < min || min == -1) 2673 min = latency; 2674 if (latency > max) 2675 max = latency; 2676 } 2677 } 2678 2679 /* 2680 * Verify that latencies/distances given in SLIT look reasonable 2681 */ 2682 retval = lgrp_plat_latency_verify(memnode_info, lat_stats); 2683 2684 if (retval) { 2685 /* 2686 * Reinitialize (zero) latency table since SLIT doesn't look 2687 * right 2688 */ 2689 for (i = 0; i < localities; i++) { 2690 for (j = 0; j < localities; j++) 2691 lat_stats->latencies[i][j] = 0; 2692 } 2693 } else { 2694 /* 2695 * Update min and max latencies seen since SLIT looks valid 2696 */ 2697 lat_stats->latency_min = min; 2698 lat_stats->latency_max = max; 2699 } 2700 2701 return (retval); 2702 } 2703 2704 2705 /* 2706 * Update lgrp latencies according to information returned by ACPI _SLI method. 
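 *
 * The sli_info buffer is assumed (based on the validation below) to hold
 * 2 * sli_cnt entries: the first sli_cnt bytes give the latency from
 * domain_id to every domain (a row of the SLIT-style matrix) and the next
 * sli_cnt bytes give the latency from every domain back to domain_id (the
 * matching column), with both self entries equal to ACPI_SLIT_SELF_LATENCY.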
2707 */ 2708 static int 2709 lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info, 2710 uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt, 2711 lgrp_plat_latency_stats_t *lat_stats) 2712 { 2713 int i; 2714 int src, dst; 2715 uint8_t latency; 2716 hrtime_t max, min; 2717 2718 if (lat_stats == NULL || sli_info == NULL || 2719 sli_cnt == 0 || domain_id >= sli_cnt) 2720 return (-1); 2721 2722 src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id); 2723 if (src == -1) { 2724 src = lgrp_plat_node_domain_update(node_domain, node_cnt, 2725 domain_id); 2726 if (src == -1) 2727 return (-1); 2728 } 2729 2730 /* 2731 * Don't update latency info if topology has been flattened to 2 levels. 2732 */ 2733 if (lgrp_plat_topo_flatten != 0) { 2734 return (0); 2735 } 2736 2737 /* 2738 * Latency information for proximity domain is ready. 2739 * TODO: support adjusting latency information at runtime. 2740 */ 2741 if (lat_stats->latencies[src][src] != 0) { 2742 return (0); 2743 } 2744 2745 /* Validate latency information. */ 2746 for (i = 0; i < sli_cnt; i++) { 2747 if (i == domain_id) { 2748 if (sli_info[i] != ACPI_SLIT_SELF_LATENCY || 2749 sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) { 2750 return (-1); 2751 } 2752 } else { 2753 if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY || 2754 sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY || 2755 sli_info[i] != sli_info[sli_cnt + i]) { 2756 return (-1); 2757 } 2758 } 2759 } 2760 2761 min = lat_stats->latency_min; 2762 max = lat_stats->latency_max; 2763 for (i = 0; i < sli_cnt; i++) { 2764 dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i); 2765 if (dst == -1) 2766 continue; 2767 2768 ASSERT(sli_info[i] == sli_info[sli_cnt + i]); 2769 2770 /* Update row in latencies matrix. */ 2771 latency = sli_info[i]; 2772 lat_stats->latencies[src][dst] = latency; 2773 if (latency < min || min == -1) 2774 min = latency; 2775 if (latency > max) 2776 max = latency; 2777 2778 /* Update column in latencies matrix. */ 2779 latency = sli_info[sli_cnt + i]; 2780 lat_stats->latencies[dst][src] = latency; 2781 if (latency < min || min == -1) 2782 min = latency; 2783 if (latency > max) 2784 max = latency; 2785 } 2786 lat_stats->latency_min = min; 2787 lat_stats->latency_max = max; 2788 2789 return (0); 2790 } 2791 2792 2793 /* 2794 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs 2795 * and memory are local to each other in the same NUMA node and return number 2796 * of nodes 2797 */ 2798 static int 2799 lgrp_plat_process_srat(struct srat *tp, struct msct *mp, 2800 uint32_t *prox_domain_min, node_domain_map_t *node_domain, 2801 cpu_node_map_t *cpu_node, int cpu_count, 2802 memnode_phys_addr_map_t *memnode_info) 2803 { 2804 struct srat_item *srat_end; 2805 int i; 2806 struct srat_item *item; 2807 int node_cnt; 2808 int proc_entry_count; 2809 int rc; 2810 2811 /* 2812 * Nothing to do when no SRAT or disabled 2813 */ 2814 if (tp == NULL || !lgrp_plat_srat_enable) 2815 return (-1); 2816 2817 /* 2818 * Try to get domain information from MSCT table. 2819 * ACPI4.0: OSPM will use information provided by the MSCT only 2820 * when the System Resource Affinity Table (SRAT) exists. 2821 */ 2822 node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min); 2823 if (node_cnt <= 0) { 2824 /* 2825 * Determine number of nodes by counting number of proximity 2826 * domains in SRAT. 2827 */ 2828 node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min); 2829 } 2830 /* 2831 * Return if number of nodes is 1 or less since don't need to read SRAT. 
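 * Otherwise each SRAT entry is examined below.  Note that for processor
 * (local APIC) affinity entries the 32-bit proximity domain is reassembled
 * from the low byte in domain1 and the three upper bytes in domain2[],
 * which is why the walk shifts domain2[i] left by (i + 1) * 8 bits.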
2832 */ 2833 if (node_cnt == 1) 2834 return (1); 2835 else if (node_cnt <= 0) 2836 return (-2); 2837 2838 /* 2839 * Walk through SRAT, examining each CPU and memory entry to determine 2840 * which CPUs and memory belong to which node. 2841 */ 2842 item = tp->list; 2843 srat_end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 2844 proc_entry_count = 0; 2845 while (item < srat_end) { 2846 uint32_t apic_id; 2847 uint32_t domain; 2848 uint64_t end; 2849 uint64_t length; 2850 uint64_t start; 2851 2852 switch (item->type) { 2853 case SRAT_PROCESSOR: /* CPU entry */ 2854 if (!(item->i.p.flags & SRAT_ENABLED) || 2855 cpu_node == NULL) 2856 break; 2857 2858 /* 2859 * Calculate domain (node) ID and fill in APIC ID to 2860 * domain/node mapping table 2861 */ 2862 domain = item->i.p.domain1; 2863 for (i = 0; i < 3; i++) { 2864 domain += item->i.p.domain2[i] << 2865 ((i + 1) * 8); 2866 } 2867 apic_id = item->i.p.apic_id; 2868 2869 rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, 2870 cpu_node, cpu_count, apic_id, domain); 2871 if (rc < 0) 2872 return (-3); 2873 else if (rc == 0) 2874 proc_entry_count++; 2875 break; 2876 2877 case SRAT_MEMORY: /* memory entry */ 2878 if (!(item->i.m.flags & SRAT_ENABLED) || 2879 memnode_info == NULL) 2880 break; 2881 2882 /* 2883 * Get domain (node) ID and fill in domain/node 2884 * to memory mapping table 2885 */ 2886 domain = item->i.m.domain; 2887 start = item->i.m.base_addr; 2888 length = item->i.m.len; 2889 end = start + length - 1; 2890 2891 /* 2892 * According to ACPI 4.0, both ENABLE and HOTPLUG flags 2893 * may be set for memory address range entries in SRAT 2894 * table which are reserved for memory hot plug. 2895 * We intersect memory address ranges in SRAT table 2896 * with memory ranges in physinstalled to filter out 2897 * memory address ranges reserved for hot plug. 2898 */ 2899 if (item->i.m.flags & SRAT_HOT_PLUG) { 2900 uint64_t rstart = UINT64_MAX; 2901 uint64_t rend = 0; 2902 struct memlist *ml; 2903 extern struct bootops *bootops; 2904 2905 memlist_read_lock(); 2906 for (ml = bootops->boot_mem->physinstalled; 2907 ml; ml = ml->ml_next) { 2908 uint64_t tstart = ml->ml_address; 2909 uint64_t tend; 2910 2911 tend = ml->ml_address + ml->ml_size; 2912 if (tstart > end || tend < start) 2913 continue; 2914 if (start > tstart) 2915 tstart = start; 2916 if (rstart > tstart) 2917 rstart = tstart; 2918 if (end < tend) 2919 tend = end; 2920 if (rend < tend) 2921 rend = tend; 2922 } 2923 memlist_read_unlock(); 2924 start = rstart; 2925 end = rend; 2926 /* Skip this entry if no memory installed. 
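 * At this point (start, end) has been clipped to the portion of the
 * SRAT range that actually overlaps physinstalled; if nothing
 * overlapped, rstart and rend were never updated and start > end,
 * so the entry is dropped.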
*/ 2927 if (start > end) 2928 break; 2929 } 2930 2931 if (lgrp_plat_memnode_info_update(node_domain, 2932 node_cnt, memnode_info, node_cnt, 2933 start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0) 2934 return (-4); 2935 break; 2936 2937 case SRAT_X2APIC: /* x2apic CPU entry */ 2938 if (!(item->i.xp.flags & SRAT_ENABLED) || 2939 cpu_node == NULL) 2940 break; 2941 2942 /* 2943 * Calculate domain (node) ID and fill in APIC ID to 2944 * domain/node mapping table 2945 */ 2946 domain = item->i.xp.domain; 2947 apic_id = item->i.xp.x2apic_id; 2948 2949 rc = lgrp_plat_cpu_node_update(node_domain, node_cnt, 2950 cpu_node, cpu_count, apic_id, domain); 2951 if (rc < 0) 2952 return (-3); 2953 else if (rc == 0) 2954 proc_entry_count++; 2955 break; 2956 2957 default: 2958 break; 2959 } 2960 2961 item = (struct srat_item *)((uintptr_t)item + item->len); 2962 } 2963 2964 /* 2965 * Should have seen at least as many SRAT processor entries as CPUs 2966 */ 2967 if (proc_entry_count < cpu_count) 2968 return (-5); 2969 2970 /* 2971 * Need to sort nodes by starting physical address since VM system 2972 * assumes and expects memnodes to be sorted in ascending order by 2973 * physical address 2974 */ 2975 lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count, 2976 memnode_info); 2977 2978 return (node_cnt); 2979 } 2980 2981 2982 /* 2983 * Allocate permanent memory for any temporary memory that we needed to 2984 * allocate using BOP_ALLOC() before kmem_alloc() and VM system were 2985 * initialized and copy everything from temporary to permanent memory since 2986 * temporary boot memory will eventually be released during boot 2987 */ 2988 static void 2989 lgrp_plat_release_bootstrap(void) 2990 { 2991 void *buf; 2992 size_t size; 2993 2994 if (lgrp_plat_cpu_node_nentries > 0) { 2995 size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t); 2996 buf = kmem_alloc(size, KM_SLEEP); 2997 bcopy(lgrp_plat_cpu_node, buf, size); 2998 lgrp_plat_cpu_node = buf; 2999 } 3000 } 3001 3002 3003 /* 3004 * Return number of proximity domains given in ACPI SRAT 3005 */ 3006 static int 3007 lgrp_plat_srat_domains(struct srat *tp, uint32_t *prox_domain_min) 3008 { 3009 int domain_cnt; 3010 uint32_t domain_min; 3011 struct srat_item *end; 3012 int i; 3013 struct srat_item *item; 3014 node_domain_map_t node_domain[MAX_NODES]; 3015 3016 3017 if (tp == NULL || !lgrp_plat_srat_enable) 3018 return (1); 3019 3020 /* 3021 * Walk through SRAT to find minimum proximity domain ID 3022 */ 3023 domain_min = UINT32_MAX; 3024 item = tp->list; 3025 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 3026 while (item < end) { 3027 uint32_t domain; 3028 3029 switch (item->type) { 3030 case SRAT_PROCESSOR: /* CPU entry */ 3031 if (!(item->i.p.flags & SRAT_ENABLED)) { 3032 item = (struct srat_item *)((uintptr_t)item + 3033 item->len); 3034 continue; 3035 } 3036 domain = item->i.p.domain1; 3037 for (i = 0; i < 3; i++) { 3038 domain += item->i.p.domain2[i] << 3039 ((i + 1) * 8); 3040 } 3041 break; 3042 3043 case SRAT_MEMORY: /* memory entry */ 3044 if (!(item->i.m.flags & SRAT_ENABLED)) { 3045 item = (struct srat_item *)((uintptr_t)item + 3046 item->len); 3047 continue; 3048 } 3049 domain = item->i.m.domain; 3050 break; 3051 3052 case SRAT_X2APIC: /* x2apic CPU entry */ 3053 if (!(item->i.xp.flags & SRAT_ENABLED)) { 3054 item = (struct srat_item *)((uintptr_t)item + 3055 item->len); 3056 continue; 3057 } 3058 domain = item->i.xp.domain; 3059 break; 3060 3061 default: 3062 item = (struct srat_item *)((uintptr_t)item + 3063 item->len); 3064 
continue; 3065 } 3066 3067 /* 3068 * Keep track of minimum proximity domain ID 3069 */ 3070 if (domain < domain_min) 3071 domain_min = domain; 3072 3073 item = (struct srat_item *)((uintptr_t)item + item->len); 3074 } 3075 if (lgrp_plat_domain_min_enable && prox_domain_min != NULL) 3076 *prox_domain_min = domain_min; 3077 3078 /* 3079 * Walk through SRAT, examining each CPU and memory entry to determine 3080 * proximity domain ID for each. 3081 */ 3082 domain_cnt = 0; 3083 item = tp->list; 3084 end = (struct srat_item *)(tp->hdr.len + (uintptr_t)tp); 3085 bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t)); 3086 while (item < end) { 3087 uint32_t domain; 3088 boolean_t overflow; 3089 uint_t start; 3090 3091 switch (item->type) { 3092 case SRAT_PROCESSOR: /* CPU entry */ 3093 if (!(item->i.p.flags & SRAT_ENABLED)) { 3094 item = (struct srat_item *)((uintptr_t)item + 3095 item->len); 3096 continue; 3097 } 3098 domain = item->i.p.domain1; 3099 for (i = 0; i < 3; i++) { 3100 domain += item->i.p.domain2[i] << 3101 ((i + 1) * 8); 3102 } 3103 break; 3104 3105 case SRAT_MEMORY: /* memory entry */ 3106 if (!(item->i.m.flags & SRAT_ENABLED)) { 3107 item = (struct srat_item *)((uintptr_t)item + 3108 item->len); 3109 continue; 3110 } 3111 domain = item->i.m.domain; 3112 break; 3113 3114 case SRAT_X2APIC: /* x2apic CPU entry */ 3115 if (!(item->i.xp.flags & SRAT_ENABLED)) { 3116 item = (struct srat_item *)((uintptr_t)item + 3117 item->len); 3118 continue; 3119 } 3120 domain = item->i.xp.domain; 3121 break; 3122 3123 default: 3124 item = (struct srat_item *)((uintptr_t)item + 3125 item->len); 3126 continue; 3127 } 3128 3129 /* 3130 * Count and keep track of which proximity domain IDs have been seen 3131 */ 3132 start = i = domain % MAX_NODES; 3133 overflow = B_TRUE; 3134 do { 3135 /* 3136 * Create entry for proximity domain and increment 3137 * count when no entry exists where proximity domain 3138 * hashed 3139 */ 3140 if (!node_domain[i].exists) { 3141 node_domain[i].exists = 1; 3142 node_domain[i].prox_domain = domain; 3143 domain_cnt++; 3144 overflow = B_FALSE; 3145 break; 3146 } 3147 3148 /* 3149 * Nothing to do when proximity domain seen already 3150 * and its entry exists 3151 */ 3152 if (node_domain[i].prox_domain == domain) { 3153 overflow = B_FALSE; 3154 break; 3155 } 3156 3157 /* 3158 * Entry exists where proximity domain hashed, but for 3159 * different proximity domain, so keep searching for an empty 3160 * slot to put it in or a matching entry, whichever comes 3161 * first. 3162 */ 3163 i = (i + 1) % MAX_NODES; 3164 } while (i != start); 3165 3166 /* 3167 * Didn't find an empty or matching entry, which means we have more 3168 * proximity domains than supported nodes (:-( 3169 */ 3170 ASSERT(overflow != B_TRUE); 3171 if (overflow == B_TRUE) 3172 return (-1); 3173 3174 item = (struct srat_item *)((uintptr_t)item + item->len); 3175 } 3176 return (domain_cnt); 3177 } 3178 3179 3180 /* 3181 * Parse domain information in ACPI Maximum System Capability Table (MSCT). 3182 * MSCT table has been verified in function process_msct() in fakebop.c.
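 *
 * Unlike lgrp_plat_srat_domains(), which counts only the proximity domains
 * present at boot, this returns maximum_proximity_domains + 1 from the
 * MSCT, i.e. the ceiling on proximity domains the platform may ever
 * expose, and it reports the smallest domain_min seen through
 * prox_domain_min.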
3183 */ 3184 static int 3185 lgrp_plat_msct_domains(struct msct *tp, uint32_t *prox_domain_min) 3186 { 3187 int last_seen = 0; 3188 uint32_t proxmin = UINT32_MAX; 3189 struct msct_proximity_domain *item, *end; 3190 3191 if (tp == NULL || lgrp_plat_msct_enable == 0) 3192 return (-1); 3193 3194 if (tp->maximum_proximity_domains >= MAX_NODES) { 3195 cmn_err(CE_CONT, 3196 "?lgrp: too many proximity domains (%d), max %d supported, " 3197 "disable support of CPU/memory DR operations.", 3198 tp->maximum_proximity_domains + 1, MAX_NODES); 3199 plat_dr_disable_cpu(); 3200 plat_dr_disable_memory(); 3201 return (-1); 3202 } 3203 3204 if (prox_domain_min != NULL) { 3205 end = (void *)(tp->hdr.len + (uintptr_t)tp); 3206 for (item = (void *)((uintptr_t)tp + 3207 tp->proximity_domain_offset); item < end; 3208 item = (void *)(item->length + (uintptr_t)item)) { 3209 if (item->domain_min < proxmin) { 3210 proxmin = item->domain_min; 3211 } 3212 3213 last_seen = item->domain_max - item->domain_min + 1; 3214 /* 3215 * Break out if all proximity domains have been 3216 * processed. Some BIOSes may have unused items 3217 * at the end of MSCT table. 3218 */ 3219 if (last_seen > tp->maximum_proximity_domains) { 3220 break; 3221 } 3222 } 3223 *prox_domain_min = proxmin; 3224 } 3225 3226 return (tp->maximum_proximity_domains + 1); 3227 } 3228 3229 3230 /* 3231 * Set lgroup latencies for 2 level lgroup topology 3232 */ 3233 static void 3234 lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats) 3235 { 3236 int i, j; 3237 3238 ASSERT(lat_stats != NULL); 3239 3240 if (lgrp_plat_node_cnt >= 4) 3241 cmn_err(CE_NOTE, 3242 "MPO only optimizing for local and remote\n"); 3243 for (i = 0; i < lgrp_plat_node_cnt; i++) { 3244 for (j = 0; j < lgrp_plat_node_cnt; j++) { 3245 if (i == j) 3246 lat_stats->latencies[i][j] = 2; 3247 else 3248 lat_stats->latencies[i][j] = 3; 3249 } 3250 } 3251 lat_stats->latency_min = 2; 3252 lat_stats->latency_max = 3; 3253 /* TODO: check it. */ 3254 lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 3255 lgrp_plat_topo_flatten = 1; 3256 } 3257 3258 3259 /* 3260 * The following Opteron specific constants, macros, types, and routines define 3261 * PCI configuration space registers and how to read them to determine the NUMA 3262 * configuration of *supported* Opteron processors. They provide the same 3263 * information that may be gotten from the ACPI System Resource Affinity Table 3264 * (SRAT) if it exists on the machine of interest. 3265 * 3266 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family 3267 * of interest describes all of these registers and their contents. The main 3268 * registers used by this code to determine the NUMA configuration of the 3269 * machine are the node ID register for the number of NUMA nodes and the DRAM 3270 * address map registers for the physical address range of each node. 3271 * 3272 * NOTE: The format and how to determine the NUMA configuration using PCI 3273 * config space registers may change or may not be supported in future 3274 * Opteron processor families. 
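 *
 * As a worked (hypothetical) example of the decoding done by the macros
 * below: a DRAM base pair of base_hi 0x0 and base_lo 0x00100003 (read and
 * write enable bits set) together with a limit pair of limit_hi 0x0 and
 * limit_lo 0x003F0000 describes node memory starting at physical address
 * 0x10000000 and ending at 0x3FFFFFFF once the low 24 offset bits are
 * filled in with OPT_DRAMADDR_LO_MASK_OFF.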
3275 */ 3276 3277 /* 3278 * How many bits to shift Opteron DRAM Address Map base and limit registers 3279 * to get actual value 3280 */ 3281 #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 3282 #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 3283 3284 #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 3285 #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 3286 3287 #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 3288 3289 /* 3290 * Macros to derive addresses from Opteron DRAM Address Map registers 3291 */ 3292 #define OPT_DRAMADDR_HI(reg) \ 3293 (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 3294 OPT_DRAMADDR_HI_LSHIFT_ADDR) 3295 3296 #define OPT_DRAMADDR_LO(reg) \ 3297 (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 3298 OPT_DRAMADDR_LO_LSHIFT_ADDR) 3299 3300 #define OPT_DRAMADDR(high, low) \ 3301 (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 3302 3303 /* 3304 * Bit masks defining what's in Opteron DRAM Address Map base register 3305 */ 3306 #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 3307 #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 3308 #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 3309 3310 /* 3311 * Bit masks defining what's in Opteron DRAM Address Map limit register 3312 */ 3313 #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 3314 #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 3315 3316 3317 /* 3318 * Opteron Node ID register in PCI configuration space contains 3319 * number of nodes in system, etc. for Opteron K8. The following 3320 * constants and macros define its contents, structure, and access. 3321 */ 3322 3323 /* 3324 * Bit masks defining what's in Opteron Node ID register 3325 */ 3326 #define OPT_NODE_MASK_ID 0x7 /* node ID */ 3327 #define OPT_NODE_MASK_CNT 0x70 /* node count */ 3328 #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 3329 #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 3330 #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 3331 3332 /* 3333 * How many bits in Opteron Node ID register to shift right to get actual value 3334 */ 3335 #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 3336 3337 /* 3338 * Macros to get values from Opteron Node ID register 3339 */ 3340 #define OPT_NODE_CNT(reg) \ 3341 ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 3342 3343 /* 3344 * Macro to setup PCI Extended Configuration Space (ECS) address to give to 3345 * "in/out" instructions 3346 * 3347 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 3348 * other uses should just do MMIO to access PCI ECS. 3349 * Must enable special bit in Northbridge Configuration Register on 3350 * Greyhound for extended CF8 space access to be able to access PCI ECS 3351 * using "in/out" instructions and restore special bit after done 3352 * accessing PCI ECS. 3353 */ 3354 #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 3355 (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 3356 (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 3357 ((((reg) >> 8) & 0xf) << 24)) 3358 3359 /* 3360 * PCI configuration space registers accessed by specifying 3361 * a bus, device, function, and offset. 
The following constants 3362 * define the values needed to access Opteron K8 configuration 3363 * info to determine its node topology 3364 */ 3365 3366 #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 3367 3368 /* 3369 * Opteron PCI configuration space register function values 3370 */ 3371 #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 3372 #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 3373 #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 3374 #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 3375 3376 /* 3377 * PCI Configuration Space register offsets 3378 */ 3379 #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 3380 #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 3381 #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 3382 #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 3383 3384 /* 3385 * Opteron PCI Configuration Space device IDs for nodes 3386 */ 3387 #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 3388 3389 3390 /* 3391 * Opteron DRAM address map gives base and limit for physical memory in a node 3392 */ 3393 typedef struct opt_dram_addr_map { 3394 uint32_t base_hi; 3395 uint32_t base_lo; 3396 uint32_t limit_hi; 3397 uint32_t limit_lo; 3398 } opt_dram_addr_map_t; 3399 3400 3401 /* 3402 * Supported AMD processor families 3403 */ 3404 #define AMD_FAMILY_HAMMER 15 3405 #define AMD_FAMILY_GREYHOUND 16 3406 3407 /* 3408 * Whether to have is_opteron() return 1 even when processor isn't supported 3409 */ 3410 uint_t is_opteron_override = 0; 3411 3412 /* 3413 * AMD processor family for current CPU 3414 */ 3415 uint_t opt_family = 0; 3416 3417 3418 /* 3419 * Determine whether we're running on a supported AMD Opteron since reading 3420 * node count and DRAM address map registers may have different format or 3421 * may not be supported across processor families 3422 */ 3423 static int 3424 is_opteron(void) 3425 { 3426 3427 if (x86_vendor != X86_VENDOR_AMD) 3428 return (0); 3429 3430 opt_family = cpuid_getfamily(CPU); 3431 if (opt_family == AMD_FAMILY_HAMMER || 3432 opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 3433 return (1); 3434 else 3435 return (0); 3436 } 3437 3438 3439 /* 3440 * Determine NUMA configuration for Opteron from registers that live in PCI 3441 * configuration space 3442 */ 3443 static void 3444 opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv, 3445 memnode_phys_addr_map_t *memnode_info) 3446 { 3447 uint_t bus; 3448 uint_t dev; 3449 struct opt_dram_addr_map dram_map[MAX_NODES]; 3450 uint_t node; 3451 uint_t node_info[MAX_NODES]; 3452 uint_t off_hi; 3453 uint_t off_lo; 3454 uint64_t nb_cfg_reg; 3455 3456 /* 3457 * Read configuration registers from PCI configuration space to 3458 * determine node information, which memory is in each node, etc. 
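 *
 * On Greyhound the high halves of the DRAM base and limit registers live
 * in PCI Extended Configuration Space, so the code below temporarily sets
 * the ECS enable bit in the Northbridge Configuration MSR around those
 * accesses and restores the original setting when done.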
3459 * 3460 * Write to PCI configuration space address register to specify 3461 * which configuration register to read and read/write PCI 3462 * configuration space data register to get/set contents 3463 */ 3464 bus = OPT_PCS_BUS_CONFIG; 3465 dev = OPT_PCS_DEV_NODE0; 3466 off_hi = OPT_PCS_OFF_DRAMBASE_HI; 3467 off_lo = OPT_PCS_OFF_DRAMBASE_LO; 3468 3469 /* 3470 * Read node ID register for node 0 to get node count 3471 */ 3472 node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 3473 OPT_PCS_OFF_NODEID); 3474 *node_cnt = OPT_NODE_CNT(node_info[0]) + 1; 3475 3476 /* 3477 * If number of nodes is more than maximum supported, then set node 3478 * count to 1 and treat system as UMA instead of NUMA. 3479 */ 3480 if (*node_cnt > MAX_NODES) { 3481 *node_cnt = 1; 3482 return; 3483 } 3484 3485 /* 3486 * For Greyhound, PCI Extended Configuration Space must be enabled to 3487 * read high DRAM address map base and limit registers 3488 */ 3489 if (opt_family == AMD_FAMILY_GREYHOUND) { 3490 nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 3491 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 3492 wrmsr(MSR_AMD_NB_CFG, 3493 nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 3494 } 3495 3496 for (node = 0; node < *node_cnt; node++) { 3497 uint32_t base_hi; 3498 uint32_t base_lo; 3499 uint32_t limit_hi; 3500 uint32_t limit_lo; 3501 3502 /* 3503 * Read node ID register (except for node 0 which we just read) 3504 */ 3505 if (node > 0) { 3506 node_info[node] = pci_getl_func(bus, dev, 3507 OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 3508 } 3509 3510 /* 3511 * Read DRAM base and limit registers which specify 3512 * physical memory range of each node 3513 */ 3514 if (opt_family != AMD_FAMILY_GREYHOUND) 3515 base_hi = 0; 3516 else { 3517 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 3518 OPT_PCS_FUNC_ADDRMAP, off_hi)); 3519 base_hi = dram_map[node].base_hi = 3520 inl(PCI_CONFDATA); 3521 } 3522 base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev, 3523 OPT_PCS_FUNC_ADDRMAP, off_lo); 3524 3525 if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) && 3526 mem_intrlv) 3527 *mem_intrlv = *mem_intrlv + 1; 3528 3529 off_hi += 4; /* high limit register offset */ 3530 if (opt_family != AMD_FAMILY_GREYHOUND) 3531 limit_hi = 0; 3532 else { 3533 outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 3534 OPT_PCS_FUNC_ADDRMAP, off_hi)); 3535 limit_hi = dram_map[node].limit_hi = 3536 inl(PCI_CONFDATA); 3537 } 3538 3539 off_lo += 4; /* low limit register offset */ 3540 limit_lo = dram_map[node].limit_lo = pci_getl_func(bus, 3541 dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 3542 3543 /* 3544 * Increment device number to next node and register offsets 3545 * for DRAM base register of next node 3546 */ 3547 off_hi += 4; 3548 off_lo += 4; 3549 dev++; 3550 3551 /* 3552 * Both read and write enable bits must be enabled in DRAM 3553 * address map base register for physical memory to exist in 3554 * node 3555 */ 3556 if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 3557 (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 3558 /* 3559 * Mark node memory as non-existent and set start and 3560 * end addresses to be same in memnode_info[] 3561 */ 3562 memnode_info[node].exists = 0; 3563 memnode_info[node].start = memnode_info[node].end = 3564 (pfn_t)-1; 3565 continue; 3566 } 3567 3568 /* 3569 * Mark node memory as existing and remember physical address 3570 * range of each node for use later 3571 */ 3572 memnode_info[node].exists = 1; 3573 3574 memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo)); 3575 3576 memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) | 3577 
OPT_DRAMADDR_LO_MASK_OFF); 3578 } 3579 3580 /* 3581 * Restore PCI Extended Configuration Space enable bit 3582 */ 3583 if (opt_family == AMD_FAMILY_GREYHOUND) { 3584 if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 3585 wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 3586 } 3587 } 3588 3589 3590 /* 3591 * Return average amount of time to read vendor ID register on Northbridge 3592 * N times on specified destination node from current CPU 3593 */ 3594 static hrtime_t 3595 opt_probe_vendor(int dest_node, int nreads) 3596 { 3597 int cnt; 3598 uint_t dev; 3599 /* LINTED: set but not used in function */ 3600 volatile uint_t dev_vendor; 3601 hrtime_t elapsed; 3602 hrtime_t end; 3603 int ipl; 3604 hrtime_t start; 3605 3606 dev = OPT_PCS_DEV_NODE0 + dest_node; 3607 kpreempt_disable(); 3608 ipl = spl8(); 3609 outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM, 3610 OPT_PCS_OFF_VENDOR)); 3611 start = gethrtime(); 3612 for (cnt = 0; cnt < nreads; cnt++) 3613 dev_vendor = inl(PCI_CONFDATA); 3614 end = gethrtime(); 3615 elapsed = (end - start) / nreads; 3616 splx(ipl); 3617 kpreempt_enable(); 3618 return (elapsed); 3619 } 3620
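/*
 * For reference, a hedged sketch of how the vendor ID probe above is used
 * by the probe path earlier in this file: each sample does something like
 *
 *	elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads);
 *
 * and the smallest elapsed time across samples becomes the probe time
 * recorded for the (from, to) node pair, which lgrp_plat_latency_adjust()
 * later symmetrizes and normalizes.
 */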