/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright (c) 2004, 2010, Oracle and/or its affiliates. All rights reserved.
 */
/*
 * Copyright (c) 2010, Intel Corporation.
 * All rights reserved.
 */

/*
 * LOCALITY GROUP (LGROUP) PLATFORM SUPPORT FOR X86/AMD64 PLATFORMS
 * ================================================================
 * Multiprocessor AMD and Intel systems may have Non Uniform Memory Access
 * (NUMA).  A NUMA machine consists of one or more "nodes" that each consist of
 * one or more CPUs and some local memory.  The CPUs in each node can access
 * the memory in the other nodes but at a higher latency than accessing their
 * local memory.  Typically, a system with only one node has Uniform Memory
 * Access (UMA), but it may be possible to have a one node system that has
 * some global memory outside of the node which is higher latency.
 *
 * Module Description
 * ------------------
 * This module provides a platform interface for determining which CPUs and
 * which memory (and how much) are in a NUMA node and how far each node is from
 * each other.  The interface is used by the Virtual Memory (VM) system and the
 * common lgroup framework.  The VM system uses the plat_*() routines to fill
 * in its memory node (memnode) array with the physical address range spanned
 * by each NUMA node to know which memory belongs to which node, so it can
 * build and manage a physical page free list for each NUMA node and allocate
 * local memory from each node as needed.
 * The common lgroup framework uses the
 * exported lgrp_plat_*() routines to figure out which CPUs and memory belong
 * to each node (leaf lgroup) and how far each node is from each other, so it
 * can build the latency (lgroup) topology for the machine in order to optimize
 * for locality.  Also, lgroup platform handles are used instead of lgroups in
 * the interface with this module, so this module shouldn't need to know
 * anything about lgroups.  Instead, it just needs to know which CPUs, memory,
 * etc. are in each NUMA node, how far each node is from each other, and to use
 * a unique lgroup platform handle to refer to each node through the interface.
 *
 * Determining NUMA Configuration
 * ------------------------------
 * By default, this module will try to determine the NUMA configuration of the
 * machine by reading the ACPI System Resource Affinity Table (SRAT) and System
 * Locality Information Table (SLIT).  The SRAT contains info to tell which
 * CPUs and memory are local to a given proximity domain (NUMA node).  The SLIT
 * is a matrix that gives the distance between each system locality (which is
 * a NUMA node and should correspond to proximity domains in the SRAT).  For
 * more details on the SRAT and SLIT, please refer to an ACPI 3.0 or newer
 * specification.
 *
 * If the SRAT doesn't exist on a system with AMD Opteron processors, we
 * examine registers in PCI configuration space to determine how many nodes are
 * in the system and which CPUs and memory are in each node.  This is the best
 * we can do while booting the kernel.
 *
 * NOTE: Using these PCI configuration space registers to determine this
 *       locality info is not guaranteed to work or be compatible across all
 *	 Opteron processor families.
 *
 * If the SLIT does not exist or look right, the kernel will probe to determine
 * the distance between nodes as long as the NUMA CPU and memory configuration
 * has been determined (see lgrp_plat_probe() for details).
 *
 * Data Structures
 * ---------------
 * The main data structures used by this code are the following:
 *
 * - lgrp_plat_cpu_node[]		CPU to node ID mapping table indexed by
 *					CPU ID (only used for SRAT)
 *
 * - lgrp_plat_lat_stats.latencies[][]	Table of latencies between same and
 *					different nodes indexed by node ID
 *
 * - lgrp_plat_node_cnt			Number of NUMA nodes in system for
 *					non-DR-capable systems,
 *					maximum possible number of NUMA nodes
 *					in system for DR capable systems.
 *
 * - lgrp_plat_node_domain[]		Node ID to proximity domain ID mapping
 *					table indexed by node ID (only used
 *					for SRAT)
 *
 * - lgrp_plat_memnode_info[]		Table with physical address range for
 *					each memory node indexed by memory node
 *					ID
 *
 * The code is implemented to make the following always be true:
 *
 *	lgroup platform handle == node ID == memnode ID
 *
 * Moreover, it allows for the proximity domain ID to be equal to all of the
 * above as long as the proximity domain IDs are numbered from 0 to <number of
 * nodes - 1>.  This is done by hashing each proximity domain ID into the range
 * from 0 to <number of nodes - 1>.  Then proximity ID N will hash into node ID
 * N and proximity domain ID N will be entered into lgrp_plat_node_domain[N]
 * and be assigned node ID N.  If the proximity domain IDs aren't numbered
 * from 0 to <number of nodes - 1>, then hashing the proximity domain IDs into
 * lgrp_plat_node_domain[] will still work for assigning proximity domain IDs
 * to node IDs.  However, the proximity domain IDs may not map to the
 * equivalent node ID since we want to keep the node IDs numbered from 0 to
 * <number of nodes - 1> to minimize cost of searching and potentially space.
 *
 * With the introduction of support of memory DR operations on x86 platforms,
 * things get a little complicated.  The addresses of hot-added memory may not
 * be contiguous with other memory connected to the same lgrp node.  In other
 * words, memory addresses may get interleaved among lgrp nodes after memory
 * DR operations.  To work around this limitation, we have extended the
 * relationship between lgrp node and memory node from a 1:1 map to a 1:N map,
 * which means there may be multiple memory nodes associated with an lgrp node
 * after memory DR operations.
 *
 * To minimize the code changes to support memory DR operations, the
 * following policies have been adopted.
 * 1) On non-DR-capable systems, the relationship among lgroup platform handle,
 *    node ID and memnode ID is still kept as:
 *	lgroup platform handle == node ID == memnode ID
 * 2) For memory present at boot time on DR capable platforms, the relationship
 *    is still kept as is.
 *	lgroup platform handle == node ID == memnode ID
 * 3) For hot-added memory, the relationship between lgrp ID and memnode ID has
 *    been changed from a 1:1 map to a 1:N map.  Memnode IDs
 *    [0 - lgrp_plat_node_cnt) are reserved for memory present at boot time,
 *    and memnode IDs [lgrp_plat_node_cnt, max_mem_nodes) are used to
 *    dynamically allocate memnode IDs for hot-added memory.
 * 4) All boot code having the assumption "node ID == memnode ID" can live as
 *    is, because node ID is always equal to memnode ID at boot time.
 * 5) The logic related to lgrp_plat_memnode_info_update(),
 *    plat_pfn_to_mem_node() and lgrp_plat_mem_size() has been enhanced to
 *    deal with the 1:N map relationship.
 * 6) The latency probing logic, which assumes "node ID == memnode ID" and
 *    may be called at run time, is disabled if memory DR operations are
 *    enabled.
 */


#include <sys/archsystm.h>	/* for {in,out}{b,w,l}() */
#include <sys/atomic.h>
#include <sys/bootconf.h>
#include <sys/cmn_err.h>
#include <sys/controlregs.h>
#include <sys/cpupart.h>
#include <sys/cpuvar.h>
#include <sys/lgrp.h>
#include <sys/machsystm.h>
#include <sys/memlist.h>
#include <sys/memnode.h>
#include <sys/mman.h>
#include <sys/note.h>
#include <sys/pci_cfgspace.h>
#include <sys/pci_impl.h>
#include <sys/param.h>
#include <sys/pghw.h>
#include <sys/promif.h>		/* for prom_printf() */
#include <sys/sysmacros.h>
#include <sys/systm.h>
#include <sys/thread.h>
#include <sys/types.h>
#include <sys/var.h>
#include <sys/x86_archext.h>
#include <vm/hat_i86.h>
#include <vm/seg_kmem.h>
#include <vm/vm_dep.h>

#include <sys/acpidev.h>
#include <sys/acpi/acpi.h>	/* for SRAT, SLIT and MSCT */

/* from fakebop.c */
extern ACPI_TABLE_SRAT *srat_ptr;
extern ACPI_TABLE_SLIT *slit_ptr;
extern ACPI_TABLE_MSCT *msct_ptr;

#define	MAX_NODES		8
#define	NLGRP			(MAX_NODES * (MAX_NODES - 1) + 1)

/*
 * Constants for configuring probing
 */
#define	LGRP_PLAT_PROBE_NROUNDS		64	/* default laps for probing */
#define	LGRP_PLAT_PROBE_NSAMPLES	1	/* default samples to take */
#define	LGRP_PLAT_PROBE_NREADS		256	/* number of vendor ID reads */

/*
 * Flags for probing
 */
#define	LGRP_PLAT_PROBE_ENABLE		0x1	/* enable probing */
#define	LGRP_PLAT_PROBE_PGCPY		0x2	/* probe using page copy */
#define	LGRP_PLAT_PROBE_VENDOR		0x4	/* probe vendor ID register */

/*
 * Hash proximity domain ID into node to domain mapping table "mod" number of
 * nodes to minimize span of entries used and try to have lowest numbered
 * proximity domain be node 0
 */
#define	NODE_DOMAIN_HASH(domain, node_cnt) \
	((lgrp_plat_prox_domain_min == UINT32_MAX) ? (domain) % node_cnt : \
	((domain) - lgrp_plat_prox_domain_min) % node_cnt)
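
/*
 * Worked example (illustrative numbers, not from any real SRAT): with four
 * proximity domains 17, 18, 19 and 20, lgrp_plat_prox_domain_min is 17, so
 * NODE_DOMAIN_HASH(17, 4) through NODE_DOMAIN_HASH(20, 4) yield nodes 0
 * through 3 and the lowest numbered domain becomes node 0.  If the minimum
 * were unknown (lgrp_plat_prox_domain_min == UINT32_MAX), the plain
 * "domain % node_cnt" form would be used instead and domain 20 would hash
 * to node 0.
 */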

/*
 * CPU to node ID mapping structure (only used with SRAT)
 */
typedef	struct cpu_node_map {
	int		exists;
	uint_t		node;
	uint32_t	apicid;
	uint32_t	prox_domain;
} cpu_node_map_t;

/*
 * Latency statistics
 */
typedef struct lgrp_plat_latency_stats {
	hrtime_t	latencies[MAX_NODES][MAX_NODES];
	hrtime_t	latency_max;
	hrtime_t	latency_min;
} lgrp_plat_latency_stats_t;

/*
 * Memory configuration for probing
 */
typedef struct lgrp_plat_probe_mem_config {
	size_t	probe_memsize;		/* how much memory to probe per node */
	caddr_t	probe_va[MAX_NODES];	/* where memory mapped for probing */
	pfn_t	probe_pfn[MAX_NODES];	/* physical pages to map for probing */
} lgrp_plat_probe_mem_config_t;

/*
 * Statistics kept for probing
 */
typedef struct lgrp_plat_probe_stats {
	hrtime_t	flush_cost;
	hrtime_t	probe_cost;
	hrtime_t	probe_cost_total;
	hrtime_t	probe_error_code;
	hrtime_t	probe_errors[MAX_NODES][MAX_NODES];
	int		probe_suspect[MAX_NODES][MAX_NODES];
	hrtime_t	probe_max[MAX_NODES][MAX_NODES];
	hrtime_t	probe_min[MAX_NODES][MAX_NODES];
} lgrp_plat_probe_stats_t;

/*
 * Node to proximity domain ID mapping structure (only used with SRAT)
 */
typedef	struct node_domain_map {
	int		exists;
	uint32_t	prox_domain;
} node_domain_map_t;

/*
 * Node ID and starting and ending page for physical memory in memory node
 */
typedef	struct memnode_phys_addr_map {
	pfn_t		start;
	pfn_t		end;
	int		exists;
	uint32_t	prox_domain;
	uint32_t	device_id;
	uint_t		lgrphand;
} memnode_phys_addr_map_t;

/*
 * Number of CPUs for which we got APIC IDs
 */
static int				lgrp_plat_apic_ncpus = 0;

/*
 * CPU to node ID mapping table (only used for SRAT) and its max number of
 * entries
 */
static cpu_node_map_t			*lgrp_plat_cpu_node = NULL;
static uint_t				lgrp_plat_cpu_node_nentries = 0;

/*
 * Latency statistics
 */
lgrp_plat_latency_stats_t		lgrp_plat_lat_stats;

/*
 * Whether memory is interleaved across nodes causing MPO to be disabled
 */
static int				lgrp_plat_mem_intrlv = 0;

/*
 * Node ID to proximity domain ID mapping table (only used for SRAT)
 */
static node_domain_map_t		lgrp_plat_node_domain[MAX_NODES];

/*
 * Physical address range for memory in each node
 */
static memnode_phys_addr_map_t		lgrp_plat_memnode_info[MAX_MEM_NODES];

/*
 * Statistics gotten from probing
 */
static lgrp_plat_probe_stats_t		lgrp_plat_probe_stats;

/*
 * Memory configuration for probing
 */
static lgrp_plat_probe_mem_config_t	lgrp_plat_probe_mem_config;

/*
 * Lowest proximity domain ID seen in ACPI SRAT
 */
static uint32_t				lgrp_plat_prox_domain_min = UINT32_MAX;

/*
 * Error code from processing ACPI SRAT
 */
static int				lgrp_plat_srat_error = 0;

/*
 * Error code from processing ACPI SLIT
 */
static int				lgrp_plat_slit_error = 0;

/*
 * Whether lgrp topology has been flattened to 2 levels.
 */
static int				lgrp_plat_topo_flatten = 0;


/*
 * Maximum memory node ID in use.
 */
static uint_t				lgrp_plat_max_mem_node;

/*
 * Allocate lgroup array statically
 */
static lgrp_t				lgrp_space[NLGRP];
static int				nlgrps_alloc;


/*
 * Enable finding and using minimum proximity domain ID when hashing
 */
int			lgrp_plat_domain_min_enable = 1;

/*
 * Maximum possible number of nodes in system
 */
uint_t			lgrp_plat_node_cnt = 1;

/*
 * Enable sorting nodes in ascending order by starting physical address
 */
int			lgrp_plat_node_sort_enable = 1;

/*
 * Configuration Parameters for Probing
 * - lgrp_plat_probe_flags	Flags to specify enabling probing, probe
 *				operation, etc.
 * - lgrp_plat_probe_nrounds	How many rounds of probing to do
 * - lgrp_plat_probe_nsamples	Number of samples to take when probing each
 *				node
 * - lgrp_plat_probe_nreads	Number of times to read vendor ID from
 *				Northbridge for each probe
 */
uint_t			lgrp_plat_probe_flags = 0;
int			lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS;
int			lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES;
int			lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS;

/*
 * Enable use of ACPI System Resource Affinity Table (SRAT), System
 * Locality Information Table (SLIT) and Maximum System Capability Table (MSCT)
 */
int			lgrp_plat_srat_enable = 1;
int			lgrp_plat_slit_enable = 1;
int			lgrp_plat_msct_enable = 1;

/*
 * mnode_xwa: set to non-zero value to initiate workaround if large pages are
 * found to be crossing memory node boundaries. The workaround will eliminate
 * a base size page at the end of each memory node boundary to ensure that
 * a large page with constituent pages that span more than 1 memory node
 * can never be formed.
 */
int	mnode_xwa = 1;

/*
 * Static array to hold lgroup statistics
 */
struct lgrp_stats	lgrp_stats[NLGRP];


/*
 * Forward declarations of platform interface routines
 */
void		plat_build_mem_nodes(struct memlist *list);

int		plat_mnode_xcheck(pfn_t pfncnt);

lgrp_handle_t	plat_mem_node_to_lgrphand(int mnode);

int		plat_pfn_to_mem_node(pfn_t pfn);

/*
 * Forward declarations of lgroup platform interface routines
 */
lgrp_t		*lgrp_plat_alloc(lgrp_id_t lgrpid);

void		lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg);

lgrp_handle_t	lgrp_plat_cpu_to_hand(processorid_t id);

void		lgrp_plat_init(lgrp_init_stages_t stage);

int		lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to);

int		lgrp_plat_max_lgrps(void);

pgcnt_t		lgrp_plat_mem_size(lgrp_handle_t plathand,
    lgrp_mem_query_t query);

lgrp_handle_t	lgrp_plat_pfn_to_hand(pfn_t pfn);

void		lgrp_plat_probe(void);

lgrp_handle_t	lgrp_plat_root_hand(void);


/*
 * Forward declarations of local routines
 */
static int	is_opteron(void);

static int	lgrp_plat_cpu_node_update(node_domain_map_t *node_domain,
    int node_cnt, cpu_node_map_t *cpu_node, int nentries, uint32_t apicid,
    uint32_t domain);

static int	lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node,
    int cpu_node_nentries);

static int	lgrp_plat_domain_to_node(node_domain_map_t *node_domain,
    int node_cnt, uint32_t domain);

static void	lgrp_plat_get_numa_config(void);

static void	lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info,
    lgrp_plat_latency_stats_t *lat_stats,
    lgrp_plat_probe_stats_t *probe_stats);

static int	lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info,
    lgrp_plat_latency_stats_t *lat_stats);

static void	lgrp_plat_main_init(void);

static pgcnt_t	lgrp_plat_mem_size_default(lgrp_handle_t, lgrp_mem_query_t);

static int	lgrp_plat_node_domain_update(node_domain_map_t *node_domain,
    int node_cnt, uint32_t domain);

static int	lgrp_plat_memnode_info_update(node_domain_map_t *node_domain,
    int node_cnt, memnode_phys_addr_map_t *memnode_info, int memnode_cnt,
    uint64_t start, uint64_t end, uint32_t domain, uint32_t device_id);

static void	lgrp_plat_node_sort(node_domain_map_t *node_domain,
    int node_cnt, cpu_node_map_t *cpu_node, int cpu_count,
    memnode_phys_addr_map_t *memnode_info);

static hrtime_t	lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node,
    int cpu_node_nentries, lgrp_plat_probe_mem_config_t *probe_mem_config,
    lgrp_plat_latency_stats_t *lat_stats,
    lgrp_plat_probe_stats_t *probe_stats);

static int	lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node);

static int	lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
    node_domain_map_t *node_domain, uint_t node_cnt,
    memnode_phys_addr_map_t *memnode_info,
    lgrp_plat_latency_stats_t *lat_stats);

static int	lgrp_plat_process_sli(uint32_t domain, uchar_t *sli_info,
    uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
    lgrp_plat_latency_stats_t *lat_stats);

static int	lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
    uint32_t *prox_domain_min, node_domain_map_t *node_domain,
    cpu_node_map_t *cpu_node, int cpu_count,
    memnode_phys_addr_map_t *memnode_info);

static void	lgrp_plat_release_bootstrap(void);

static int	lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp,
    uint32_t *prox_domain_min);

static int	lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp,
    uint32_t *prox_domain_min);

static void	lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats);

static void	opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    memnode_phys_addr_map_t *memnode_info);

static hrtime_t	opt_probe_vendor(int dest_node, int nreads);


/*
 * PLATFORM INTERFACE ROUTINES
 */

/*
 * Configure memory nodes for machines with more than one node (ie NUMA)
 */
void
plat_build_mem_nodes(struct memlist *list)
{
	pfn_t		cur_start;	/* start addr of subrange */
	pfn_t		cur_end;	/* end addr of subrange */
	pfn_t		start;		/* start addr of whole range */
	pfn_t		end;		/* end addr of whole range */
	pgcnt_t		endcnt;		/* pages to sacrifice */

	/*
	 * Boot install lists are arranged <addr, len>, ...
	 */
	while (list) {
		int	node;

		start = list->ml_address >> PAGESHIFT;
		end = (list->ml_address + list->ml_size - 1) >> PAGESHIFT;

		if (start > physmax) {
			list = list->ml_next;
			continue;
		}
		if (end > physmax)
			end = physmax;

		/*
		 * When there is only one memnode, just add memory to memnode
		 */
		if (max_mem_nodes == 1) {
			mem_node_add_slice(start, end);
			list = list->ml_next;
			continue;
		}

		/*
		 * mem_node_add_slice() expects to get a memory range that
		 * is within one memnode, so need to split any memory range
		 * that spans multiple memnodes into subranges that are each
		 * contained within one memnode when feeding them to
		 * mem_node_add_slice()
		 */
		cur_start = start;
		do {
			node = plat_pfn_to_mem_node(cur_start);

			/*
			 * Panic if DRAM address map registers or SRAT say
			 * memory in node doesn't exist or address from
			 * boot installed memory list entry isn't in this node.
			 * This shouldn't happen and rest of code can't deal
			 * with this if it does.
			 */
			if (node < 0 || node >= lgrp_plat_max_mem_node ||
			    !lgrp_plat_memnode_info[node].exists ||
			    cur_start < lgrp_plat_memnode_info[node].start ||
			    cur_start > lgrp_plat_memnode_info[node].end) {
				cmn_err(CE_PANIC, "Don't know which memnode "
				    "to add installed memory address 0x%lx\n",
				    cur_start);
			}

			/*
			 * End of current subrange should not span memnodes
			 */
			cur_end = end;
			endcnt = 0;
			if (lgrp_plat_memnode_info[node].exists &&
			    cur_end > lgrp_plat_memnode_info[node].end) {
				cur_end = lgrp_plat_memnode_info[node].end;
				if (mnode_xwa > 1) {
					/*
					 * sacrifice the last page in each
					 * node to eliminate large pages
					 * that span more than 1 memory node.
					 */
					endcnt = 1;
					physinstalled--;
				}
			}

			mem_node_add_slice(cur_start, cur_end - endcnt);

			/*
			 * Next subrange starts after end of current one
			 */
			cur_start = cur_end + 1;
		} while (cur_end < end);

		list = list->ml_next;
	}
	mem_node_physalign = 0;
	mem_node_pfn_shift = 0;
}
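
/*
 * Illustrative example for the splitting above (hypothetical PFNs, not from
 * any real system): with memnode 0 spanning PFNs [0x0, 0xfffff] and memnode 1
 * spanning [0x100000, 0x1fffff], a boot memlist entry covering PFNs
 * 0xf0000-0x10ffff is fed to mem_node_add_slice() as two slices,
 * 0xf0000-0xfffff and 0x100000-0x10ffff.  With mnode_xwa > 1, the first slice
 * is further trimmed to 0xf0000-0xffffe so that no large page can straddle
 * the memnode boundary.
 */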


/*
 * plat_mnode_xcheck: checks the node memory ranges to see if there is a pfncnt
 * range of pages aligned on pfncnt that crosses an node boundary. Returns 1 if
 * a crossing is found and returns 0 otherwise.
 */
int
plat_mnode_xcheck(pfn_t pfncnt)
{
	int	node, prevnode = -1, basenode;
	pfn_t	ea, sa;

	for (node = 0; node < lgrp_plat_max_mem_node; node++) {

		if (lgrp_plat_memnode_info[node].exists == 0)
			continue;

		if (prevnode == -1) {
			prevnode = node;
			basenode = node;
			continue;
		}

		/* assume x86 node pfn ranges are in increasing order */
		ASSERT(lgrp_plat_memnode_info[node].start >
		    lgrp_plat_memnode_info[prevnode].end);

		/*
		 * continue if the starting address of node is not contiguous
		 * with the previous node.
		 */

		if (lgrp_plat_memnode_info[node].start !=
		    (lgrp_plat_memnode_info[prevnode].end + 1)) {
			basenode = node;
			prevnode = node;
			continue;
		}

		/* check if the starting address of node is pfncnt aligned */
		if ((lgrp_plat_memnode_info[node].start & (pfncnt - 1)) != 0) {

			/*
			 * at this point, node starts at an unaligned boundary
			 * and is contiguous with the previous node(s) to
			 * basenode. Check if there is an aligned contiguous
			 * range of length pfncnt that crosses this boundary.
			 */

			sa = P2ALIGN(lgrp_plat_memnode_info[prevnode].end,
			    pfncnt);
			ea = P2ROUNDUP((lgrp_plat_memnode_info[node].start),
			    pfncnt);

			ASSERT((ea - sa) == pfncnt);
			if (sa >= lgrp_plat_memnode_info[basenode].start &&
			    ea <= (lgrp_plat_memnode_info[node].end + 1)) {
				/*
				 * large page found to cross mnode boundary.
				 * Return Failure if workaround not enabled.
				 */
				if (mnode_xwa == 0)
					return (1);
				mnode_xwa++;
			}
		}
		prevnode = node;
	}
	return (0);
}


lgrp_handle_t
plat_mem_node_to_lgrphand(int mnode)
{
	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	ASSERT(0 <= mnode && mnode < lgrp_plat_max_mem_node);

	return ((lgrp_handle_t)(lgrp_plat_memnode_info[mnode].lgrphand));
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int	node;

	if (max_mem_nodes == 1)
		return (0);

	for (node = 0; node < lgrp_plat_max_mem_node; node++) {
		/*
		 * Skip nodes with no memory
		 */
		if (!lgrp_plat_memnode_info[node].exists)
			continue;

		membar_consumer();
		if (pfn >= lgrp_plat_memnode_info[node].start &&
		    pfn <= lgrp_plat_memnode_info[node].end)
			return (node);
	}

	/*
	 * Didn't find memnode where this PFN lives which should never happen
	 */
	ASSERT(node < lgrp_plat_max_mem_node);
	return (-1);
}


/*
 * LGROUP PLATFORM INTERFACE ROUTINES
 */

/*
 * Allocate additional space for an lgroup.
 */
lgrp_t *
lgrp_plat_alloc(lgrp_id_t lgrpid)
{
	lgrp_t	*lgrp;

	lgrp = &lgrp_space[nlgrps_alloc++];
	if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP)
		return (NULL);
	return (lgrp);
}


/*
 * Platform handling for (re)configuration changes
 *
 * Mechanism to protect lgrp_plat_cpu_node[] at CPU hotplug:
 * 1) Use cpu_lock to synchronize between lgrp_plat_config() and
 *    lgrp_plat_cpu_to_hand().
 * 2) Disable latency probing logic by making sure that the flag
 *    LGRP_PLAT_PROBE_ENABLE is cleared.
 *
 * Mechanism to protect lgrp_plat_memnode_info[] at memory hotplug:
 * 1) Only inserts into lgrp_plat_memnode_info at memory hotplug, no removal.
 * 2) Only expansion to existing entries, no shrinking.
 * 3) On writing side, DR framework ensures that lgrp_plat_config() is called
 *    in single-threaded context. And membar_producer() is used to ensure that
 *    all changes are visible to other CPUs before setting the "exists" flag.
 * 4) On reading side, membar_consumer() after checking the "exists" flag
 *    ensures that right values are retrieved.
 *
 * Mechanism to protect lgrp_plat_node_domain[] at hotplug:
 * 1) Only insertion into lgrp_plat_node_domain at hotplug, no removal.
 * 2) On writing side, it's single-threaded and membar_producer() is used to
 *    ensure all changes are visible to other CPUs before setting the "exists"
 *    flag.
 * 3) On reading side, membar_consumer() after checking the "exists" flag
 *    ensures that right values are retrieved.
 */
void
lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg)
{
#ifdef	__xpv
	_NOTE(ARGUNUSED(flag, arg));
#else
	int	rc, node;
	cpu_t	*cp;
	void	*hdl = NULL;
	uchar_t	*sliptr = NULL;
	uint32_t domain, apicid, slicnt = 0;
	update_membounds_t *mp;

	extern int acpidev_dr_get_cpu_numa_info(cpu_t *, void **, uint32_t *,
	    uint32_t *, uint32_t *, uchar_t **);
	extern void acpidev_dr_free_cpu_numa_info(void *);

	/*
	 * This interface is used to support CPU/memory DR operations.
	 * Don't bother here if it's still during boot or only one lgrp node
	 * is supported.
	 */
	if (!lgrp_topo_initialized || lgrp_plat_node_cnt == 1)
		return;

	switch (flag) {
	case LGRP_CONFIG_CPU_ADD:
		cp = (cpu_t *)arg;
		ASSERT(cp != NULL);
		ASSERT(MUTEX_HELD(&cpu_lock));

		/* Check whether CPU already exists. */
		ASSERT(!lgrp_plat_cpu_node[cp->cpu_id].exists);
		if (lgrp_plat_cpu_node[cp->cpu_id].exists) {
			cmn_err(CE_WARN,
			    "!lgrp: CPU(%d) already exists in cpu_node map.",
			    cp->cpu_id);
			break;
		}

		/* Query CPU lgrp information. */
		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
		    &slicnt, &sliptr);
		ASSERT(rc == 0);
		if (rc != 0) {
			cmn_err(CE_WARN,
			    "!lgrp: failed to query lgrp info for CPU(%d).",
			    cp->cpu_id);
			break;
		}

		/* Update node to proximity domain mapping */
		node = lgrp_plat_domain_to_node(lgrp_plat_node_domain,
		    lgrp_plat_node_cnt, domain);
		if (node == -1) {
			node = lgrp_plat_node_domain_update(
			    lgrp_plat_node_domain, lgrp_plat_node_cnt, domain);
			ASSERT(node != -1);
			if (node == -1) {
				acpidev_dr_free_cpu_numa_info(hdl);
				cmn_err(CE_WARN, "!lgrp: failed to update "
				    "node_domain map for domain(%u).", domain);
				break;
			}
		}

		/* Update latency information among lgrps. */
		if (slicnt != 0 && sliptr != NULL) {
			if (lgrp_plat_process_sli(domain, sliptr, slicnt,
			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
			    &lgrp_plat_lat_stats) != 0) {
				cmn_err(CE_WARN, "!lgrp: failed to update "
				    "latency information for domain (%u).",
				    domain);
			}
		}

		/* Update CPU to node mapping. */
		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = domain;
		lgrp_plat_cpu_node[cp->cpu_id].node = node;
		lgrp_plat_cpu_node[cp->cpu_id].apicid = apicid;
		lgrp_plat_cpu_node[cp->cpu_id].exists = 1;
		lgrp_plat_apic_ncpus++;

		acpidev_dr_free_cpu_numa_info(hdl);
		break;

	case LGRP_CONFIG_CPU_DEL:
		cp = (cpu_t *)arg;
		ASSERT(cp != NULL);
		ASSERT(MUTEX_HELD(&cpu_lock));

		/* Check whether CPU exists. */
		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].exists);
		if (!lgrp_plat_cpu_node[cp->cpu_id].exists) {
			cmn_err(CE_WARN,
			    "!lgrp: CPU(%d) doesn't exist in cpu_node map.",
			    cp->cpu_id);
			break;
		}

		/* Query CPU lgrp information. */
		rc = acpidev_dr_get_cpu_numa_info(cp, &hdl, &apicid, &domain,
		    NULL, NULL);
		ASSERT(rc == 0);
		if (rc != 0) {
			cmn_err(CE_WARN,
			    "!lgrp: failed to query lgrp info for CPU(%d).",
			    cp->cpu_id);
			break;
		}

		/* Update map. */
		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].apicid == apicid);
		ASSERT(lgrp_plat_cpu_node[cp->cpu_id].prox_domain == domain);
		lgrp_plat_cpu_node[cp->cpu_id].exists = 0;
		lgrp_plat_cpu_node[cp->cpu_id].apicid = UINT32_MAX;
		lgrp_plat_cpu_node[cp->cpu_id].prox_domain = UINT32_MAX;
		lgrp_plat_cpu_node[cp->cpu_id].node = UINT_MAX;
		lgrp_plat_apic_ncpus--;

		acpidev_dr_free_cpu_numa_info(hdl);
		break;

	case LGRP_CONFIG_MEM_ADD:
		mp = (update_membounds_t *)arg;
		ASSERT(mp != NULL);

		/* Update latency information among lgrps. */
		if (mp->u_sli_cnt != 0 && mp->u_sli_ptr != NULL) {
			if (lgrp_plat_process_sli(mp->u_domain,
			    mp->u_sli_ptr, mp->u_sli_cnt,
			    lgrp_plat_node_domain, lgrp_plat_node_cnt,
			    &lgrp_plat_lat_stats) != 0) {
				cmn_err(CE_WARN, "!lgrp: failed to update "
				    "latency information for domain (%u).",
				    mp->u_domain);
			}
		}

		if (lgrp_plat_memnode_info_update(lgrp_plat_node_domain,
		    lgrp_plat_node_cnt, lgrp_plat_memnode_info, max_mem_nodes,
		    mp->u_base, mp->u_base + mp->u_length,
		    mp->u_domain, mp->u_device_id) < 0) {
			cmn_err(CE_WARN,
			    "!lgrp: failed to update memnode information for "
			    "memory (0x%" PRIx64 " - 0x%" PRIx64 ").",
			    mp->u_base, mp->u_base + mp->u_length);
		}
		break;

	default:
		break;
	}
#endif	/* __xpv */
}
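
/*
 * Illustrative sketch (not additional interface code) of the membar-based
 * publication protocol described above lgrp_plat_config().  The writer fills
 * in every field of an entry before publishing it, and the reader only uses
 * the fields after seeing the "exists" flag:
 *
 *	writer (DR, single-threaded)	reader (e.g. plat_pfn_to_mem_node())
 *	----------------------------	------------------------------------
 *	entry->start = ...;		if (entry->exists) {
 *	entry->end = ...;			membar_consumer();
 *	membar_producer();			... use entry->start/end ...
 *	entry->exists = 1;		}
 */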


/*
 * Return the platform handle for the lgroup containing the given CPU
 */
lgrp_handle_t
lgrp_plat_cpu_to_hand(processorid_t id)
{
	lgrp_handle_t	hand;

	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));

	if (lgrp_plat_node_cnt == 1)
		return (LGRP_DEFAULT_HANDLE);

	hand = (lgrp_handle_t)lgrp_plat_cpu_to_node(cpu[id],
	    lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries);

	ASSERT(hand != (lgrp_handle_t)-1);
	if (hand == (lgrp_handle_t)-1)
		return (LGRP_NULL_HANDLE);

	return (hand);
}


/*
 * Platform-specific initialization of lgroups
 */
void
lgrp_plat_init(lgrp_init_stages_t stage)
{
#if defined(__xpv)
#else	/* __xpv */
	u_longlong_t value;
#endif	/* __xpv */

	switch (stage) {
	case LGRP_INIT_STAGE1:
#if defined(__xpv)
		/*
		 * XXPV	For now, the hypervisor treats all memory equally.
		 */
		lgrp_plat_node_cnt = max_mem_nodes = 1;
#else	/* __xpv */

		/*
		 * Get boot property for lgroup topology height limit
		 */
		if (bootprop_getval(BP_LGRP_TOPO_LEVELS, &value) == 0)
			(void) lgrp_topo_ht_limit_set((int)value);

		/*
		 * Get boot property for enabling/disabling SRAT
		 */
		if (bootprop_getval(BP_LGRP_SRAT_ENABLE, &value) == 0)
			lgrp_plat_srat_enable = (int)value;

		/*
		 * Get boot property for enabling/disabling SLIT
		 */
		if (bootprop_getval(BP_LGRP_SLIT_ENABLE, &value) == 0)
			lgrp_plat_slit_enable = (int)value;

		/*
		 * Get boot property for enabling/disabling MSCT
		 */
		if (bootprop_getval(BP_LGRP_MSCT_ENABLE, &value) == 0)
			lgrp_plat_msct_enable = (int)value;

		/*
		 * Initialize as a UMA machine
		 */
		if (lgrp_topo_ht_limit() == 1) {
			lgrp_plat_node_cnt = max_mem_nodes = 1;
			lgrp_plat_max_mem_node = 1;
			return;
		}

		lgrp_plat_get_numa_config();

		/*
		 * Each lgrp node needs MAX_MEM_NODES_PER_LGROUP memnodes
		 * to support memory DR operations if memory DR is enabled.
		 */
		lgrp_plat_max_mem_node = lgrp_plat_node_cnt;
		if (plat_dr_support_memory() && lgrp_plat_node_cnt != 1) {
			max_mem_nodes = MAX_MEM_NODES_PER_LGROUP *
			    lgrp_plat_node_cnt;
			ASSERT(max_mem_nodes <= MAX_MEM_NODES);
		}
#endif	/* __xpv */
		break;

	case LGRP_INIT_STAGE3:
		lgrp_plat_probe();
		lgrp_plat_release_bootstrap();
		break;

	case LGRP_INIT_STAGE4:
		lgrp_plat_main_init();
		break;

	default:
		break;
	}
}


/*
 * Return latency between "from" and "to" lgroups
 *
 * This latency number can only be used for relative comparison
 * between lgroups on the running system, cannot be used across platforms,
 * and may not reflect the actual latency.  It is platform and implementation
 * specific, so platform gets to decide its value.  It would be nice if the
 * number was at least proportional to make comparisons more meaningful though.
 */
int
lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	lgrp_handle_t	src, dest;
	int		node;

	if (max_mem_nodes == 1)
		return (0);

	/*
	 * Return max latency for root lgroup
	 */
	if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_lat_stats.latency_max);

	src = from;
	dest = to;

	/*
	 * Return 0 for nodes (lgroup platform handles) out of range
	 */
	if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES)
		return (0);

	/*
	 * Probe from current CPU if its lgroup latencies haven't been set yet
	 * and we are trying to get latency from current CPU to some node.
	 * Avoid probing if CPU/memory DR is enabled.
	 */
	if (lgrp_plat_lat_stats.latencies[src][src] == 0) {
		/*
		 * Latency information should be updated by lgrp_plat_config()
		 * for DR operations. Something is wrong if we reach here.
		 * For safety, flatten lgrp topology to two levels.
		 */
		if (plat_dr_support_cpu() || plat_dr_support_memory()) {
			ASSERT(lgrp_plat_lat_stats.latencies[src][src]);
			cmn_err(CE_WARN,
			    "lgrp: failed to get latency information, "
			    "fall back to two-level topology.");
			lgrp_plat_2level_setup(&lgrp_plat_lat_stats);
		} else {
			node = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node,
			    lgrp_plat_cpu_node_nentries);
			ASSERT(node >= 0 && node < lgrp_plat_node_cnt);
			if (node == src)
				lgrp_plat_probe();
		}
	}

	return (lgrp_plat_lat_stats.latencies[src][dest]);
}


/*
 * Return the maximum number of lgrps supported by the platform.
 * Before lgrp topology is known it returns an estimate based on the number of
 * nodes. Once topology is known it returns:
 * 1) the actual maximum number of lgrps created if CPU/memory DR operations
 *    are not supported.
 * 2) the maximum possible number of lgrps if CPU/memory DR operations are
 *    supported.
 */
int
lgrp_plat_max_lgrps(void)
{
	if (!lgrp_topo_initialized || plat_dr_support_cpu() ||
	    plat_dr_support_memory()) {
		return (lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1);
	} else {
		return (lgrp_alloc_max + 1);
	}
}


/*
 * Count number of memory pages (_t) based on mnode id (_n) and query type (_q).
 */
#define	_LGRP_PLAT_MEM_SIZE(_n, _q, _t)					\
	if (mem_node_config[_n].exists) {				\
		switch (_q) {						\
		case LGRP_MEM_SIZE_FREE:				\
			_t += MNODE_PGCNT(_n);				\
			break;						\
		case LGRP_MEM_SIZE_AVAIL:				\
			_t += mem_node_memlist_pages(_n, phys_avail);	\
			break;						\
		case LGRP_MEM_SIZE_INSTALL:				\
			_t += mem_node_memlist_pages(_n, phys_install);	\
			break;						\
		default:						\
			break;						\
		}							\
	}
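
/*
 * For example (illustrative), _LGRP_PLAT_MEM_SIZE(0, query, npgs) expands to
 * code that adds the free, available or installed page count of memory node 0
 * to the local variable npgs, but only when mem_node 0 exists; this is how
 * lgrp_plat_mem_size() below accumulates its result across memnodes.
 */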

/*
 * Return the number of free pages in an lgroup.
 *
 * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize
 * pages on freelists.  For query of LGRP_MEM_SIZE_AVAIL, return the
 * number of allocatable base pagesize pages corresponding to the
 * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..)
 * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical
 * memory installed, regardless of whether or not it's usable.
 */
pgcnt_t
lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query)
{
	int	mnode;
	pgcnt_t npgs = (pgcnt_t)0;
	extern struct memlist *phys_avail;
	extern struct memlist *phys_install;


	if (plathand == LGRP_DEFAULT_HANDLE)
		return (lgrp_plat_mem_size_default(plathand, query));

	if (plathand != LGRP_NULL_HANDLE) {
		/* Count memory node present at boot. */
		mnode = (int)plathand;
		ASSERT(mnode < lgrp_plat_node_cnt);
		_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);

		/* Count possible hot-added memory nodes. */
		for (mnode = lgrp_plat_node_cnt;
		    mnode < lgrp_plat_max_mem_node; mnode++) {
			if (lgrp_plat_memnode_info[mnode].lgrphand == plathand)
				_LGRP_PLAT_MEM_SIZE(mnode, query, npgs);
		}
	}

	return (npgs);
}


/*
 * Return the platform handle of the lgroup that contains the physical memory
 * corresponding to the given page frame number
 */
lgrp_handle_t
lgrp_plat_pfn_to_hand(pfn_t pfn)
{
	int	mnode;

	if (max_mem_nodes == 1)
		return (LGRP_DEFAULT_HANDLE);

	if (pfn > physmax)
		return (LGRP_NULL_HANDLE);

	mnode = plat_pfn_to_mem_node(pfn);
	if (mnode < 0)
		return (LGRP_NULL_HANDLE);

	return (MEM_NODE_2_LGRPHAND(mnode));
}


/*
 * Probe memory in each node from current CPU to determine latency topology
 *
 * The probing code will probe the vendor ID register on the Northbridge of
 * Opteron processors and probe memory for other processors by default.
 *
 * Since probing is inherently error prone, the code takes laps across all the
 * nodes probing from each node to each of the other nodes some number of
 * times.  Furthermore, each node is probed some number of times before moving
 * onto the next one during each lap.  The minimum latency gotten between nodes
 * is kept as the latency between the nodes.
 *
 * After all that, the probe times are adjusted by normalizing values that are
 * close to each other and local latencies are made the same.  Lastly, the
 * latencies are verified to make sure that certain conditions are met (eg.
 * local < remote, latency(a, b) == latency(b, a), etc.).
 *
 * If any of the conditions aren't met, the code will export a NUMA
 * configuration with the local CPUs and memory given by the SRAT or PCI config
 * space registers and one remote memory latency since it can't tell exactly
 * how far each node is from each other.
 */
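/*
 * In other words (illustrative summary of the loop below), for each pair of
 * nodes the kept value is
 *
 *	latencies[from][to] = min over all rounds of probe_time(from, to)
 *
 * with latency_min and latency_max tracking the smallest and largest probe
 * time seen across any pair of nodes.
 */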
12262e2c009bSjjc * 12272e2c009bSjjc * If any of the conditions aren't met, the code will export a NUMA 12282e2c009bSjjc * configuration with the local CPUs and memory given by the SRAT or PCI config 12292e2c009bSjjc * space registers and one remote memory latency since it can't tell exactly 12302e2c009bSjjc * how far each node is from each other. 12312e2c009bSjjc */ 12322e2c009bSjjc void 12332e2c009bSjjc lgrp_plat_probe(void) 12342e2c009bSjjc { 12352e2c009bSjjc int from; 12362e2c009bSjjc int i; 12372e2c009bSjjc lgrp_plat_latency_stats_t *lat_stats; 12381ce8847aSJonathan Chew boolean_t probed; 12392e2c009bSjjc hrtime_t probe_time; 12402e2c009bSjjc int to; 12412e2c009bSjjc 12422e2c009bSjjc if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 12432e2c009bSjjc max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 12442e2c009bSjjc return; 12452e2c009bSjjc 1246a3114836SGerry Liu /* SRAT and SLIT should be enabled if DR operations are enabled. */ 1247a3114836SGerry Liu if (plat_dr_support_cpu() || plat_dr_support_memory()) 1248a3114836SGerry Liu return; 1249a3114836SGerry Liu 12502e2c009bSjjc /* 12512e2c009bSjjc * Determine ID of node containing current CPU 12522e2c009bSjjc */ 1253d5d7cf4eSJonathan Chew from = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 1254d5d7cf4eSJonathan Chew lgrp_plat_cpu_node_nentries); 12552e2c009bSjjc ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 12562e2c009bSjjc if (srat_ptr && lgrp_plat_srat_enable && !lgrp_plat_srat_error) 12572e2c009bSjjc ASSERT(lgrp_plat_node_domain[from].exists); 12582e2c009bSjjc 12592e2c009bSjjc /* 12602e2c009bSjjc * Don't need to probe if we already have the probe times 12612e2c009bSjjc */ 12622e2c009bSjjc lat_stats = &lgrp_plat_lat_stats; 12632e2c009bSjjc if (lat_stats->latencies[from][from] != 0) 12642e2c009bSjjc return; 12652e2c009bSjjc 12662e2c009bSjjc /* 12672e2c009bSjjc * Read vendor ID in Northbridge or read and write page(s) 12682e2c009bSjjc * in each node from current CPU and remember how long it takes, 12692e2c009bSjjc * so we can build latency topology of machine later. 12702e2c009bSjjc * This should approximate the memory latency between each node. 
12712e2c009bSjjc */ 12721ce8847aSJonathan Chew probed = B_FALSE; 12732e2c009bSjjc for (i = 0; i < lgrp_plat_probe_nrounds; i++) { 12742e2c009bSjjc for (to = 0; to < lgrp_plat_node_cnt; to++) { 12752e2c009bSjjc /* 12761ce8847aSJonathan Chew * Get probe time and skip over any nodes that can't be 12771ce8847aSJonathan Chew * probed yet or don't have memory 12782e2c009bSjjc */ 12792e2c009bSjjc probe_time = lgrp_plat_probe_time(to, 1280d5d7cf4eSJonathan Chew lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries, 1281d5d7cf4eSJonathan Chew &lgrp_plat_probe_mem_config, &lgrp_plat_lat_stats, 1282d5d7cf4eSJonathan Chew &lgrp_plat_probe_stats); 12832e2c009bSjjc if (probe_time == 0) 12841ce8847aSJonathan Chew continue; 12851ce8847aSJonathan Chew 12861ce8847aSJonathan Chew probed = B_TRUE; 12872e2c009bSjjc 12882e2c009bSjjc /* 12892e2c009bSjjc * Keep lowest probe time as latency between nodes 12902e2c009bSjjc */ 12912e2c009bSjjc if (lat_stats->latencies[from][to] == 0 || 12922e2c009bSjjc probe_time < lat_stats->latencies[from][to]) 12932e2c009bSjjc lat_stats->latencies[from][to] = probe_time; 12942e2c009bSjjc 12952e2c009bSjjc /* 12962e2c009bSjjc * Update overall minimum and maximum probe times 12972e2c009bSjjc * across all nodes 12982e2c009bSjjc */ 12992e2c009bSjjc if (probe_time < lat_stats->latency_min || 13002e2c009bSjjc lat_stats->latency_min == -1) 13012e2c009bSjjc lat_stats->latency_min = probe_time; 13022e2c009bSjjc if (probe_time > lat_stats->latency_max) 13032e2c009bSjjc lat_stats->latency_max = probe_time; 13042e2c009bSjjc } 13052e2c009bSjjc } 13062e2c009bSjjc 13072e2c009bSjjc /* 13081ce8847aSJonathan Chew * Bail out if we weren't able to probe any nodes from current CPU 13091ce8847aSJonathan Chew */ 13101ce8847aSJonathan Chew if (probed == B_FALSE) 13111ce8847aSJonathan Chew return; 13121ce8847aSJonathan Chew 13131ce8847aSJonathan Chew /* 13142e2c009bSjjc * - Fix up latencies such that local latencies are same, 13152e2c009bSjjc * latency(i, j) == latency(j, i), etc. (if possible) 13162e2c009bSjjc * 13172e2c009bSjjc * - Verify that latencies look ok 13182e2c009bSjjc * 13192e2c009bSjjc * - Fall back to just optimizing for local and remote if 13202e2c009bSjjc * latencies didn't look right 13212e2c009bSjjc */ 1322a3114836SGerry Liu lgrp_plat_latency_adjust(lgrp_plat_memnode_info, &lgrp_plat_lat_stats, 13232e2c009bSjjc &lgrp_plat_probe_stats); 13242e2c009bSjjc lgrp_plat_probe_stats.probe_error_code = 1325a3114836SGerry Liu lgrp_plat_latency_verify(lgrp_plat_memnode_info, 13262e2c009bSjjc &lgrp_plat_lat_stats); 13272e2c009bSjjc if (lgrp_plat_probe_stats.probe_error_code) 1328a3114836SGerry Liu lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 13292e2c009bSjjc } 13302e2c009bSjjc 13312e2c009bSjjc 13322e2c009bSjjc /* 13332e2c009bSjjc * Return platform handle for root lgroup 13342e2c009bSjjc */ 13352e2c009bSjjc lgrp_handle_t 13362e2c009bSjjc lgrp_plat_root_hand(void) 13372e2c009bSjjc { 13382e2c009bSjjc return (LGRP_DEFAULT_HANDLE); 13392e2c009bSjjc } 13402e2c009bSjjc 13412e2c009bSjjc 13422e2c009bSjjc /* 13432e2c009bSjjc * INTERNAL ROUTINES 13442e2c009bSjjc */ 13452e2c009bSjjc 13462e2c009bSjjc 13472e2c009bSjjc /* 1348a3114836SGerry Liu * Update CPU to node mapping for given CPU and proximity domain. 
1349a3114836SGerry Liu * Return values: 1350a3114836SGerry Liu * - zero for success 1351a3114836SGerry Liu * - positive numbers for warnings 1352a3114836SGerry Liu * - negative numbers for errors 13532e2c009bSjjc */ 13542e2c009bSjjc static int 1355d821f0f0Sjjc lgrp_plat_cpu_node_update(node_domain_map_t *node_domain, int node_cnt, 1356dae2fa37Sjjc cpu_node_map_t *cpu_node, int nentries, uint32_t apicid, uint32_t domain) 13572e2c009bSjjc { 13582e2c009bSjjc uint_t i; 13592e2c009bSjjc int node; 13602e2c009bSjjc 13612e2c009bSjjc /* 13622e2c009bSjjc * Get node number for proximity domain 13632e2c009bSjjc */ 1364d821f0f0Sjjc node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 13652e2c009bSjjc if (node == -1) { 1366d821f0f0Sjjc node = lgrp_plat_node_domain_update(node_domain, node_cnt, 1367d821f0f0Sjjc domain); 13682e2c009bSjjc if (node == -1) 13692e2c009bSjjc return (-1); 13702e2c009bSjjc } 13712e2c009bSjjc 13722e2c009bSjjc /* 1373dae2fa37Sjjc * Search for entry with given APIC ID and fill in its node and 1374dae2fa37Sjjc * proximity domain IDs (if they haven't been set already) 13752e2c009bSjjc */ 1376dae2fa37Sjjc for (i = 0; i < nentries; i++) { 13772e2c009bSjjc /* 1378dae2fa37Sjjc * Skip nonexistent entries and ones without matching APIC ID 13792e2c009bSjjc */ 1380dae2fa37Sjjc if (!cpu_node[i].exists || cpu_node[i].apicid != apicid) 1381dae2fa37Sjjc continue; 1382dae2fa37Sjjc 13832e2c009bSjjc /* 1384dae2fa37Sjjc * Just return if entry completely and correctly filled in 1385dae2fa37Sjjc * already 13862e2c009bSjjc */ 13872e2c009bSjjc if (cpu_node[i].prox_domain == domain && 13882e2c009bSjjc cpu_node[i].node == node) 13892e2c009bSjjc return (1); 13902e2c009bSjjc 13912e2c009bSjjc /* 1392a3114836SGerry Liu * It's invalid to have more than one entry with the same 1393a3114836SGerry Liu * local APIC ID in the SRAT table. 1394a3114836SGerry Liu */ 1395a3114836SGerry Liu if (cpu_node[i].node != UINT_MAX) 1396a3114836SGerry Liu return (-2); 1397a3114836SGerry Liu 1398a3114836SGerry Liu /* 1399dae2fa37Sjjc * Fill in node and proximity domain IDs 14002e2c009bSjjc */ 14012e2c009bSjjc cpu_node[i].prox_domain = domain; 14022e2c009bSjjc cpu_node[i].node = node; 1403dae2fa37Sjjc 14042e2c009bSjjc return (0); 14052e2c009bSjjc } 14062e2c009bSjjc 14072e2c009bSjjc /* 1408a3114836SGerry Liu * It's possible that an apicid doesn't exist in the cpu_node map 1409a3114836SGerry Liu * because the user limited the number of CPUs powered on at boot by 1410a3114836SGerry Liu * specifying the boot_ncpus kernel option. 14112e2c009bSjjc */ 1412a3114836SGerry Liu return (2); 14132e2c009bSjjc } 14142e2c009bSjjc 14152e2c009bSjjc 14162e2c009bSjjc /* 1417dae2fa37Sjjc * Get node ID for given CPU 14182e2c009bSjjc */ 14192e2c009bSjjc static int 1420d5d7cf4eSJonathan Chew lgrp_plat_cpu_to_node(cpu_t *cp, cpu_node_map_t *cpu_node, 1421d5d7cf4eSJonathan Chew int cpu_node_nentries) 14222e2c009bSjjc { 1423dae2fa37Sjjc processorid_t cpuid; 14242e2c009bSjjc 14252e2c009bSjjc if (cp == NULL) 14262e2c009bSjjc return (-1); 14272e2c009bSjjc 1428dae2fa37Sjjc cpuid = cp->cpu_id; 1429dae2fa37Sjjc if (cpuid < 0 || cpuid >= max_ncpus) 1430dae2fa37Sjjc return (-1); 1431dae2fa37Sjjc 14322e2c009bSjjc /* 14332e2c009bSjjc * SRAT doesn't exist, isn't enabled, or there was an error processing 14348031591dSSrihari Venkatesan * it, so return node ID for Opteron and -1 otherwise. 
14352e2c009bSjjc */ 14362e2c009bSjjc if (srat_ptr == NULL || !lgrp_plat_srat_enable || 14372e2c009bSjjc lgrp_plat_srat_error) { 14382e2c009bSjjc if (is_opteron()) 14398031591dSSrihari Venkatesan return (pg_plat_hw_instance_id(cp, PGHW_PROCNODE)); 14402e2c009bSjjc return (-1); 14412e2c009bSjjc } 14422e2c009bSjjc 14432e2c009bSjjc /* 1444dae2fa37Sjjc * Return -1 when CPU to node ID mapping entry doesn't exist for given 1445dae2fa37Sjjc * CPU 14462e2c009bSjjc */ 1447d5d7cf4eSJonathan Chew if (cpuid >= cpu_node_nentries || !cpu_node[cpuid].exists) 14482e2c009bSjjc return (-1); 1449dae2fa37Sjjc 1450dae2fa37Sjjc return (cpu_node[cpuid].node); 14512e2c009bSjjc } 14522e2c009bSjjc 14532e2c009bSjjc 14542e2c009bSjjc /* 14552e2c009bSjjc * Return node number for given proximity domain/system locality 14562e2c009bSjjc */ 14572e2c009bSjjc static int 1458d821f0f0Sjjc lgrp_plat_domain_to_node(node_domain_map_t *node_domain, int node_cnt, 1459d821f0f0Sjjc uint32_t domain) 14602e2c009bSjjc { 14612e2c009bSjjc uint_t node; 14622e2c009bSjjc uint_t start; 14632e2c009bSjjc 14642e2c009bSjjc /* 14652e2c009bSjjc * Hash proximity domain ID into node to domain mapping table (array), 14662e2c009bSjjc * search for entry with matching proximity domain ID, and return index 14672e2c009bSjjc * of matching entry as node ID. 14682e2c009bSjjc */ 1469d821f0f0Sjjc node = start = NODE_DOMAIN_HASH(domain, node_cnt); 14702e2c009bSjjc do { 1471a3114836SGerry Liu if (node_domain[node].exists) { 1472a3114836SGerry Liu membar_consumer(); 1473a3114836SGerry Liu if (node_domain[node].prox_domain == domain) 14742e2c009bSjjc return (node); 1475a3114836SGerry Liu } 1476cf5755f2SJonathan Chew node = (node + 1) % node_cnt; 14772e2c009bSjjc } while (node != start); 14782e2c009bSjjc return (-1); 14792e2c009bSjjc } 14802e2c009bSjjc 14812e2c009bSjjc 14822e2c009bSjjc /* 1483d5d7cf4eSJonathan Chew * Get NUMA configuration of machine 1484d5d7cf4eSJonathan Chew */ 1485d5d7cf4eSJonathan Chew static void 1486d5d7cf4eSJonathan Chew lgrp_plat_get_numa_config(void) 1487d5d7cf4eSJonathan Chew { 1488d5d7cf4eSJonathan Chew uint_t probe_op; 1489d5d7cf4eSJonathan Chew 1490d5d7cf4eSJonathan Chew /* 1491d5d7cf4eSJonathan Chew * Read boot property with CPU to APIC ID mapping table/array to 1492d5d7cf4eSJonathan Chew * determine number of CPUs 1493d5d7cf4eSJonathan Chew */ 1494d5d7cf4eSJonathan Chew lgrp_plat_apic_ncpus = lgrp_plat_process_cpu_apicids(NULL); 1495d5d7cf4eSJonathan Chew 1496d5d7cf4eSJonathan Chew /* 1497d5d7cf4eSJonathan Chew * Determine which CPUs and memory are local to each other and number 1498d5d7cf4eSJonathan Chew * of NUMA nodes by reading ACPI System Resource Affinity Table (SRAT) 1499d5d7cf4eSJonathan Chew */ 1500d5d7cf4eSJonathan Chew if (lgrp_plat_apic_ncpus > 0) { 1501d5d7cf4eSJonathan Chew int retval; 1502d5d7cf4eSJonathan Chew 1503a3114836SGerry Liu /* Reserve enough resources if CPU DR is enabled. 
*/ 1504a3114836SGerry Liu if (plat_dr_support_cpu() && max_ncpus > lgrp_plat_apic_ncpus) 1505a3114836SGerry Liu lgrp_plat_cpu_node_nentries = max_ncpus; 1506a3114836SGerry Liu else 1507a3114836SGerry Liu lgrp_plat_cpu_node_nentries = lgrp_plat_apic_ncpus; 1508a3114836SGerry Liu 1509d5d7cf4eSJonathan Chew /* 1510d5d7cf4eSJonathan Chew * Temporarily allocate boot memory to use for CPU to node 1511d5d7cf4eSJonathan Chew * mapping since kernel memory allocator isn't alive yet 1512d5d7cf4eSJonathan Chew */ 1513d5d7cf4eSJonathan Chew lgrp_plat_cpu_node = (cpu_node_map_t *)BOP_ALLOC(bootops, 1514a3114836SGerry Liu NULL, lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t), 1515d5d7cf4eSJonathan Chew sizeof (int)); 1516d5d7cf4eSJonathan Chew 1517d5d7cf4eSJonathan Chew ASSERT(lgrp_plat_cpu_node != NULL); 1518d5d7cf4eSJonathan Chew if (lgrp_plat_cpu_node) { 1519d5d7cf4eSJonathan Chew bzero(lgrp_plat_cpu_node, lgrp_plat_cpu_node_nentries * 1520d5d7cf4eSJonathan Chew sizeof (cpu_node_map_t)); 1521a3114836SGerry Liu } else { 1522a3114836SGerry Liu lgrp_plat_cpu_node_nentries = 0; 1523d5d7cf4eSJonathan Chew } 1524d5d7cf4eSJonathan Chew 1525d5d7cf4eSJonathan Chew /* 1526d5d7cf4eSJonathan Chew * Fill in CPU to node ID mapping table with APIC ID for each 1527d5d7cf4eSJonathan Chew * CPU 1528d5d7cf4eSJonathan Chew */ 1529d5d7cf4eSJonathan Chew (void) lgrp_plat_process_cpu_apicids(lgrp_plat_cpu_node); 1530d5d7cf4eSJonathan Chew 1531a3114836SGerry Liu retval = lgrp_plat_process_srat(srat_ptr, msct_ptr, 1532d5d7cf4eSJonathan Chew &lgrp_plat_prox_domain_min, 1533d5d7cf4eSJonathan Chew lgrp_plat_node_domain, lgrp_plat_cpu_node, 1534a3114836SGerry Liu lgrp_plat_apic_ncpus, lgrp_plat_memnode_info); 1535d5d7cf4eSJonathan Chew if (retval <= 0) { 1536d5d7cf4eSJonathan Chew lgrp_plat_srat_error = retval; 1537d5d7cf4eSJonathan Chew lgrp_plat_node_cnt = 1; 1538d5d7cf4eSJonathan Chew } else { 1539d5d7cf4eSJonathan Chew lgrp_plat_srat_error = 0; 1540d5d7cf4eSJonathan Chew lgrp_plat_node_cnt = retval; 1541d5d7cf4eSJonathan Chew } 1542d5d7cf4eSJonathan Chew } 1543d5d7cf4eSJonathan Chew 1544d5d7cf4eSJonathan Chew /* 1545d5d7cf4eSJonathan Chew * Try to use PCI config space registers on Opteron if there's an error 1546d5d7cf4eSJonathan Chew * processing CPU to APIC ID mapping or SRAT 1547d5d7cf4eSJonathan Chew */ 1548d5d7cf4eSJonathan Chew if ((lgrp_plat_apic_ncpus <= 0 || lgrp_plat_srat_error != 0) && 1549d5d7cf4eSJonathan Chew is_opteron()) 1550d5d7cf4eSJonathan Chew opt_get_numa_config(&lgrp_plat_node_cnt, &lgrp_plat_mem_intrlv, 1551a3114836SGerry Liu lgrp_plat_memnode_info); 1552d5d7cf4eSJonathan Chew 1553d5d7cf4eSJonathan Chew /* 1554d5d7cf4eSJonathan Chew * Don't bother to setup system for multiple lgroups and only use one 1555d5d7cf4eSJonathan Chew * memory node when memory is interleaved between any nodes or there is 1556d5d7cf4eSJonathan Chew * only one NUMA node 1557d5d7cf4eSJonathan Chew */ 1558d5d7cf4eSJonathan Chew if (lgrp_plat_mem_intrlv || lgrp_plat_node_cnt == 1) { 1559d5d7cf4eSJonathan Chew lgrp_plat_node_cnt = max_mem_nodes = 1; 1560d5d7cf4eSJonathan Chew (void) lgrp_topo_ht_limit_set(1); 1561d5d7cf4eSJonathan Chew return; 1562d5d7cf4eSJonathan Chew } 1563d5d7cf4eSJonathan Chew 1564d5d7cf4eSJonathan Chew /* 1565d5d7cf4eSJonathan Chew * Leaf lgroups on x86/x64 architectures contain one physical 1566d5d7cf4eSJonathan Chew * processor chip. Tune lgrp_expand_proc_thresh and 1567d5d7cf4eSJonathan Chew * lgrp_expand_proc_diff so that lgrp_choose() will spread 1568d5d7cf4eSJonathan Chew * things out aggressively. 
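 *
 * (Interpretive note: halving the expansion threshold and zeroing
 * lgrp_expand_proc_diff makes lgrp_choose() willing to home new threads
 * in another lgroup as soon as the current one carries even moderate
 * load, rather than waiting for a large load imbalance to build up.)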
1569d5d7cf4eSJonathan Chew */ 1570d5d7cf4eSJonathan Chew lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 1571d5d7cf4eSJonathan Chew lgrp_expand_proc_diff = 0; 1572d5d7cf4eSJonathan Chew 1573d5d7cf4eSJonathan Chew /* 1574d5d7cf4eSJonathan Chew * There should be one memnode (physical page free list(s)) for 1575a3114836SGerry Liu * each node if memory DR is disabled. 1576d5d7cf4eSJonathan Chew */ 1577d5d7cf4eSJonathan Chew max_mem_nodes = lgrp_plat_node_cnt; 1578d5d7cf4eSJonathan Chew 1579d5d7cf4eSJonathan Chew /* 1580d5d7cf4eSJonathan Chew * Initialize min and max latency before reading SLIT or probing 1581d5d7cf4eSJonathan Chew */ 1582d5d7cf4eSJonathan Chew lgrp_plat_lat_stats.latency_min = -1; 1583d5d7cf4eSJonathan Chew lgrp_plat_lat_stats.latency_max = 0; 1584d5d7cf4eSJonathan Chew 1585d5d7cf4eSJonathan Chew /* 1586d5d7cf4eSJonathan Chew * Determine how far each NUMA node is from each other by 1587d5d7cf4eSJonathan Chew * reading ACPI System Locality Information Table (SLIT) if it 1588d5d7cf4eSJonathan Chew * exists 1589d5d7cf4eSJonathan Chew */ 1590d5d7cf4eSJonathan Chew lgrp_plat_slit_error = lgrp_plat_process_slit(slit_ptr, 1591a3114836SGerry Liu lgrp_plat_node_domain, lgrp_plat_node_cnt, lgrp_plat_memnode_info, 1592d5d7cf4eSJonathan Chew &lgrp_plat_lat_stats); 1593a3114836SGerry Liu 1594a3114836SGerry Liu /* 1595a3114836SGerry Liu * Disable support of CPU/memory DR operations if multiple locality 1596a3114836SGerry Liu * domains exist in the system and either of the following is true. 1597a3114836SGerry Liu * 1) Failed to process SLIT table. 1598a3114836SGerry Liu * 2) Latency probing is enabled by user. 1599a3114836SGerry Liu */ 1600a3114836SGerry Liu if (lgrp_plat_node_cnt > 1 && 1601a3114836SGerry Liu (plat_dr_support_cpu() || plat_dr_support_memory())) { 1602a3114836SGerry Liu if (!lgrp_plat_slit_enable || lgrp_plat_slit_error != 0 || 1603a3114836SGerry Liu !lgrp_plat_srat_enable || lgrp_plat_srat_error != 0 || 1604a3114836SGerry Liu lgrp_plat_apic_ncpus <= 0) { 1605a3114836SGerry Liu cmn_err(CE_CONT, 1606a3114836SGerry Liu "?lgrp: failed to process ACPI SRAT/SLIT table, " 1607a3114836SGerry Liu "disable support of CPU/memory DR operations."); 1608a3114836SGerry Liu plat_dr_disable_cpu(); 1609a3114836SGerry Liu plat_dr_disable_memory(); 1610a3114836SGerry Liu } else if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) { 1611a3114836SGerry Liu cmn_err(CE_CONT, 1612a3114836SGerry Liu "?lgrp: latency probing enabled by user, " 1613a3114836SGerry Liu "disable support of CPU/memory DR operations."); 1614a3114836SGerry Liu plat_dr_disable_cpu(); 1615a3114836SGerry Liu plat_dr_disable_memory(); 1616a3114836SGerry Liu } 1617a3114836SGerry Liu } 1618a3114836SGerry Liu 1619a3114836SGerry Liu /* Done if we succeeded in processing the SLIT table. 
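 * Otherwise fall through: enable latency probing below and pick a probe
 * method (vendor ID reads on Opteron, page copies elsewhere).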
*/ 1620d5d7cf4eSJonathan Chew if (lgrp_plat_slit_error == 0) 1621d5d7cf4eSJonathan Chew return; 1622d5d7cf4eSJonathan Chew 1623d5d7cf4eSJonathan Chew /* 1624d5d7cf4eSJonathan Chew * Probe to determine latency between NUMA nodes when SLIT 1625d5d7cf4eSJonathan Chew * doesn't exist or doesn't make sense 1626d5d7cf4eSJonathan Chew */ 1627d5d7cf4eSJonathan Chew lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_ENABLE; 1628d5d7cf4eSJonathan Chew 1629d5d7cf4eSJonathan Chew /* 1630d5d7cf4eSJonathan Chew * Specify whether to probe using vendor ID register or page copy 1631d5d7cf4eSJonathan Chew * if it hasn't been specified already or is overspecified 1632d5d7cf4eSJonathan Chew */ 1633d5d7cf4eSJonathan Chew probe_op = lgrp_plat_probe_flags & 1634d5d7cf4eSJonathan Chew (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1635d5d7cf4eSJonathan Chew 1636d5d7cf4eSJonathan Chew if (probe_op == 0 || 1637d5d7cf4eSJonathan Chew probe_op == (LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR)) { 1638d5d7cf4eSJonathan Chew lgrp_plat_probe_flags &= 1639d5d7cf4eSJonathan Chew ~(LGRP_PLAT_PROBE_PGCPY|LGRP_PLAT_PROBE_VENDOR); 1640d5d7cf4eSJonathan Chew if (is_opteron()) 1641d5d7cf4eSJonathan Chew lgrp_plat_probe_flags |= 1642d5d7cf4eSJonathan Chew LGRP_PLAT_PROBE_VENDOR; 1643d5d7cf4eSJonathan Chew else 1644d5d7cf4eSJonathan Chew lgrp_plat_probe_flags |= LGRP_PLAT_PROBE_PGCPY; 1645d5d7cf4eSJonathan Chew } 1646d5d7cf4eSJonathan Chew 1647d5d7cf4eSJonathan Chew /* 1648d5d7cf4eSJonathan Chew * Probing errors can mess up the lgroup topology and 1649d5d7cf4eSJonathan Chew * force us to fall back to a 2 level lgroup topology. 1650d5d7cf4eSJonathan Chew * Here we bound how tall the lgroup topology can grow 1651d5d7cf4eSJonathan Chew * in hopes of avoiding any anomalies in probing from 1652d5d7cf4eSJonathan Chew * messing up the lgroup topology by limiting the 1653d5d7cf4eSJonathan Chew * accuracy of the latency topology. 
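 *
 * Worked example: on a 4-node system the default height limit is
 * replaced with 4 - 1 = 3 by the lgrp_topo_ht_limit_set() call below,
 * so probing can never produce a topology taller than a ring of nodes
 * would justify.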
1654d5d7cf4eSJonathan Chew * 1655d5d7cf4eSJonathan Chew * Assume that nodes will at least be configured in a 1656d5d7cf4eSJonathan Chew * ring, so limit height of lgroup topology to be less 1657d5d7cf4eSJonathan Chew * than number of nodes on a system with 4 or more 1658d5d7cf4eSJonathan Chew * nodes 1659d5d7cf4eSJonathan Chew */ 1660d5d7cf4eSJonathan Chew if (lgrp_plat_node_cnt >= 4 && lgrp_topo_ht_limit() == 1661d5d7cf4eSJonathan Chew lgrp_topo_ht_limit_default()) 1662d5d7cf4eSJonathan Chew (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 1663d5d7cf4eSJonathan Chew } 1664d5d7cf4eSJonathan Chew 1665d5d7cf4eSJonathan Chew 1666d5d7cf4eSJonathan Chew /* 16672e2c009bSjjc * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 16682e2c009bSjjc * be considered the same 16692e2c009bSjjc */ 16702e2c009bSjjc #define LGRP_LAT_TOLERANCE_SHIFT 4 16712e2c009bSjjc 16722e2c009bSjjc int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 16732e2c009bSjjc 16742e2c009bSjjc 16752e2c009bSjjc /* 16762e2c009bSjjc * Adjust latencies between nodes to be symmetric, normalize latencies between 16772e2c009bSjjc * any nodes that are within some tolerance to be the same, and make local 16782e2c009bSjjc * latencies the same 16792e2c009bSjjc */ 16802e2c009bSjjc static void 1681a3114836SGerry Liu lgrp_plat_latency_adjust(memnode_phys_addr_map_t *memnode_info, 16822e2c009bSjjc lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 16832e2c009bSjjc { 16842e2c009bSjjc int i; 16852e2c009bSjjc int j; 16862e2c009bSjjc int k; 16872e2c009bSjjc int l; 16882e2c009bSjjc u_longlong_t max; 16892e2c009bSjjc u_longlong_t min; 16902e2c009bSjjc u_longlong_t t; 16912e2c009bSjjc u_longlong_t t1; 16922e2c009bSjjc u_longlong_t t2; 16932e2c009bSjjc const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 16942e2c009bSjjc int lat_corrected[MAX_NODES][MAX_NODES]; 16952e2c009bSjjc 16962e2c009bSjjc /* 16972e2c009bSjjc * Nothing to do when this is an UMA machine or we don't have the args needed 16982e2c009bSjjc */ 16992e2c009bSjjc if (max_mem_nodes == 1) 17002e2c009bSjjc return; 17012e2c009bSjjc 1702a3114836SGerry Liu ASSERT(memnode_info != NULL && lat_stats != NULL && 17032e2c009bSjjc probe_stats != NULL); 17042e2c009bSjjc 17052e2c009bSjjc /* 17062e2c009bSjjc * Make sure that latencies are symmetric between any two nodes 17072e2c009bSjjc * (ie. 
latency(node0, node1) == latency(node1, node0)) 17082e2c009bSjjc */ 17092e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1710a3114836SGerry Liu if (!memnode_info[i].exists) 17112e2c009bSjjc continue; 17122e2c009bSjjc 17132e2c009bSjjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 1714a3114836SGerry Liu if (!memnode_info[j].exists) 17152e2c009bSjjc continue; 17162e2c009bSjjc 17172e2c009bSjjc t1 = lat_stats->latencies[i][j]; 17182e2c009bSjjc t2 = lat_stats->latencies[j][i]; 17192e2c009bSjjc 17202e2c009bSjjc if (t1 == 0 || t2 == 0 || t1 == t2) 17212e2c009bSjjc continue; 17222e2c009bSjjc 17232e2c009bSjjc /* 17242e2c009bSjjc * Latencies should be same 17252e2c009bSjjc * - Use minimum of two latencies which should be same 17262e2c009bSjjc * - Track suspect probe times not within tolerance of 17272e2c009bSjjc * min value 17282e2c009bSjjc * - Remember how much values are corrected by 17292e2c009bSjjc */ 17302e2c009bSjjc if (t1 > t2) { 17312e2c009bSjjc t = t2; 17322e2c009bSjjc probe_stats->probe_errors[i][j] += t1 - t2; 17332e2c009bSjjc if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 17342e2c009bSjjc probe_stats->probe_suspect[i][j]++; 17352e2c009bSjjc probe_stats->probe_suspect[j][i]++; 17362e2c009bSjjc } 17372e2c009bSjjc } else if (t2 > t1) { 17382e2c009bSjjc t = t1; 17392e2c009bSjjc probe_stats->probe_errors[j][i] += t2 - t1; 17402e2c009bSjjc if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 17412e2c009bSjjc probe_stats->probe_suspect[i][j]++; 17422e2c009bSjjc probe_stats->probe_suspect[j][i]++; 17432e2c009bSjjc } 17442e2c009bSjjc } 17452e2c009bSjjc 17462e2c009bSjjc lat_stats->latencies[i][j] = 17472e2c009bSjjc lat_stats->latencies[j][i] = t; 17482e2c009bSjjc lgrp_config(cflag, t1, t); 17492e2c009bSjjc lgrp_config(cflag, t2, t); 17502e2c009bSjjc } 17512e2c009bSjjc } 17522e2c009bSjjc 17532e2c009bSjjc /* 17542e2c009bSjjc * Keep track of which latencies get corrected 17552e2c009bSjjc */ 17562e2c009bSjjc for (i = 0; i < MAX_NODES; i++) 17572e2c009bSjjc for (j = 0; j < MAX_NODES; j++) 17582e2c009bSjjc lat_corrected[i][j] = 0; 17592e2c009bSjjc 17602e2c009bSjjc /* 17612e2c009bSjjc * For every two nodes, see whether there is another pair of nodes which 17622e2c009bSjjc * are about the same distance apart and make the latencies be the same 17632e2c009bSjjc * if they are close enough together 17642e2c009bSjjc */ 17652e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 17662e2c009bSjjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 1767a3114836SGerry Liu if (!memnode_info[j].exists) 17682e2c009bSjjc continue; 17692e2c009bSjjc /* 17702e2c009bSjjc * Pick one pair of nodes (i, j) 17712e2c009bSjjc * and get latency between them 17722e2c009bSjjc */ 17732e2c009bSjjc t1 = lat_stats->latencies[i][j]; 17742e2c009bSjjc 17752e2c009bSjjc /* 17762e2c009bSjjc * Skip this pair of nodes if there isn't a latency 17772e2c009bSjjc * for it yet 17782e2c009bSjjc */ 17792e2c009bSjjc if (t1 == 0) 17802e2c009bSjjc continue; 17812e2c009bSjjc 17822e2c009bSjjc for (k = 0; k < lgrp_plat_node_cnt; k++) { 17832e2c009bSjjc for (l = 0; l < lgrp_plat_node_cnt; l++) { 1784a3114836SGerry Liu if (!memnode_info[l].exists) 17852e2c009bSjjc continue; 17862e2c009bSjjc /* 17872e2c009bSjjc * Pick another pair of nodes (k, l) 17882e2c009bSjjc * not same as (i, j) and get latency 17892e2c009bSjjc * between them 17902e2c009bSjjc */ 17912e2c009bSjjc if (k == i && l == j) 17922e2c009bSjjc continue; 17932e2c009bSjjc 17942e2c009bSjjc t2 = lat_stats->latencies[k][l]; 17952e2c009bSjjc 17962e2c009bSjjc /* 17972e2c009bSjjc * Skip this pair of nodes if there 
17982e2c009bSjjc * isn't a latency for it yet 17992e2c009bSjjc */ 18002e2c009bSjjc 18012e2c009bSjjc if (t2 == 0) 18022e2c009bSjjc continue; 18032e2c009bSjjc 18042e2c009bSjjc /* 18052e2c009bSjjc * Skip nodes (k, l) if they already 18062e2c009bSjjc * have same latency as (i, j) or 18072e2c009bSjjc * their latency isn't close enough to 18082e2c009bSjjc * be considered/made the same 18092e2c009bSjjc */ 18102e2c009bSjjc if (t1 == t2 || (t1 > t2 && t1 - t2 > 18112e2c009bSjjc t1 >> lgrp_plat_probe_lt_shift) || 18122e2c009bSjjc (t2 > t1 && t2 - t1 > 18132e2c009bSjjc t2 >> lgrp_plat_probe_lt_shift)) 18142e2c009bSjjc continue; 18152e2c009bSjjc 18162e2c009bSjjc /* 18172e2c009bSjjc * Make latency(i, j) same as 18182e2c009bSjjc * latency(k, l), try to use latency 18192e2c009bSjjc * that has been adjusted already to get 18202e2c009bSjjc * more consistency (if possible), and 18212e2c009bSjjc * remember which latencies were 18222e2c009bSjjc * adjusted for next time 18232e2c009bSjjc */ 18242e2c009bSjjc if (lat_corrected[i][j]) { 18252e2c009bSjjc t = t1; 18262e2c009bSjjc lgrp_config(cflag, t2, t); 18272e2c009bSjjc t2 = t; 18282e2c009bSjjc } else if (lat_corrected[k][l]) { 18292e2c009bSjjc t = t2; 18302e2c009bSjjc lgrp_config(cflag, t1, t); 18312e2c009bSjjc t1 = t; 18322e2c009bSjjc } else { 18332e2c009bSjjc if (t1 > t2) 18342e2c009bSjjc t = t2; 18352e2c009bSjjc else 18362e2c009bSjjc t = t1; 18372e2c009bSjjc lgrp_config(cflag, t1, t); 18382e2c009bSjjc lgrp_config(cflag, t2, t); 18392e2c009bSjjc t1 = t2 = t; 18402e2c009bSjjc } 18412e2c009bSjjc 18422e2c009bSjjc lat_stats->latencies[i][j] = 18432e2c009bSjjc lat_stats->latencies[k][l] = t; 18442e2c009bSjjc 18452e2c009bSjjc lat_corrected[i][j] = 18462e2c009bSjjc lat_corrected[k][l] = 1; 18472e2c009bSjjc } 18482e2c009bSjjc } 18492e2c009bSjjc } 18502e2c009bSjjc } 18512e2c009bSjjc 18522e2c009bSjjc /* 18532e2c009bSjjc * Local latencies should be same 18542e2c009bSjjc * - Find min and max local latencies 18552e2c009bSjjc * - Make all local latencies be minimum 18562e2c009bSjjc */ 18572e2c009bSjjc min = -1; 18582e2c009bSjjc max = 0; 18592e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1860a3114836SGerry Liu if (!memnode_info[i].exists) 18612e2c009bSjjc continue; 18622e2c009bSjjc t = lat_stats->latencies[i][i]; 18632e2c009bSjjc if (t == 0) 18642e2c009bSjjc continue; 18652e2c009bSjjc if (min == -1 || t < min) 18662e2c009bSjjc min = t; 18672e2c009bSjjc if (t > max) 18682e2c009bSjjc max = t; 18692e2c009bSjjc } 18702e2c009bSjjc if (min != max) { 18712e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 18722e2c009bSjjc int local; 18732e2c009bSjjc 1874a3114836SGerry Liu if (!memnode_info[i].exists) 18752e2c009bSjjc continue; 18762e2c009bSjjc 18772e2c009bSjjc local = lat_stats->latencies[i][i]; 18782e2c009bSjjc if (local == 0) 18792e2c009bSjjc continue; 18802e2c009bSjjc 18812e2c009bSjjc /* 18822e2c009bSjjc * Track suspect probe times that aren't within 18832e2c009bSjjc * tolerance of minimum local latency and how much 18842e2c009bSjjc * probe times are corrected by 18852e2c009bSjjc */ 18862e2c009bSjjc if (local - min > min >> lgrp_plat_probe_lt_shift) 18872e2c009bSjjc probe_stats->probe_suspect[i][i]++; 18882e2c009bSjjc 18892e2c009bSjjc probe_stats->probe_errors[i][i] += local - min; 18902e2c009bSjjc 18912e2c009bSjjc /* 18922e2c009bSjjc * Make local latencies be minimum 18932e2c009bSjjc */ 18942e2c009bSjjc lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 18952e2c009bSjjc lat_stats->latencies[i][i] = min; 18962e2c009bSjjc } 18972e2c009bSjjc } 18982e2c009bSjjc 
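	/*
	 * Worked example of the tolerance rule used above (numbers made
	 * up): with lgrp_plat_probe_lt_shift == 4, two probe times count
	 * as "the same" when they differ by at most 1/16th of the larger
	 * one.  Times of 106 and 100 differ by 6, which is within
	 * 106 >> 4 == 6, so both are normalized to 100; times of 110 and
	 * 100 differ by 10, which exceeds the tolerance, so they are
	 * left distinct.
	 */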
18992e2c009bSjjc /* 19002e2c009bSjjc * Determine max probe time again since latencies were just adjusted 19012e2c009bSjjc */ 19022e2c009bSjjc lat_stats->latency_max = 0; 19032e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 19042e2c009bSjjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 1905a3114836SGerry Liu if (!memnode_info[j].exists) 19062e2c009bSjjc continue; 19072e2c009bSjjc t = lat_stats->latencies[i][j]; 19082e2c009bSjjc if (t > lat_stats->latency_max) 19092e2c009bSjjc lat_stats->latency_max = t; 19102e2c009bSjjc } 19112e2c009bSjjc } 19122e2c009bSjjc } 19132e2c009bSjjc 19142e2c009bSjjc 19152e2c009bSjjc /* 19162e2c009bSjjc * Verify the following about latencies between nodes: 19172e2c009bSjjc * 19182e2c009bSjjc * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 19192e2c009bSjjc * - Local latencies same 19202e2c009bSjjc * - Local < remote 19212e2c009bSjjc * - Number of latencies seen is reasonable 19222e2c009bSjjc * - Number of occurrences of a given latency should be more than 1 19232e2c009bSjjc * 19242e2c009bSjjc * Returns: 19252e2c009bSjjc * 0 Success 19262e2c009bSjjc * -1 Not symmetric 19272e2c009bSjjc * -2 Local latencies not same 19282e2c009bSjjc * -3 Local >= remote 19292e2c009bSjjc */ 19302e2c009bSjjc static int 1931a3114836SGerry Liu lgrp_plat_latency_verify(memnode_phys_addr_map_t *memnode_info, 19322e2c009bSjjc lgrp_plat_latency_stats_t *lat_stats) 19332e2c009bSjjc { 19342e2c009bSjjc int i; 19352e2c009bSjjc int j; 19362e2c009bSjjc u_longlong_t t1; 19372e2c009bSjjc u_longlong_t t2; 19382e2c009bSjjc 1939a3114836SGerry Liu ASSERT(memnode_info != NULL && lat_stats != NULL); 19402e2c009bSjjc 19412e2c009bSjjc /* 19422e2c009bSjjc * Nothing to do when this is an UMA machine, lgroup topology is 19432e2c009bSjjc * limited to 2 levels, or there aren't any probe times yet 19442e2c009bSjjc */ 19452e2c009bSjjc if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 19462e2c009bSjjc lat_stats->latencies[0][0] == 0) 19472e2c009bSjjc return (0); 19482e2c009bSjjc 19492e2c009bSjjc /* 19502e2c009bSjjc * Make sure that latencies are symmetric between any two nodes 19512e2c009bSjjc * (ie. 
latency(node0, node1) == latency(node1, node0)) 19522e2c009bSjjc */ 19532e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 1954a3114836SGerry Liu if (!memnode_info[i].exists) 19552e2c009bSjjc continue; 19562e2c009bSjjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 1957a3114836SGerry Liu if (!memnode_info[j].exists) 19582e2c009bSjjc continue; 19592e2c009bSjjc t1 = lat_stats->latencies[i][j]; 19602e2c009bSjjc t2 = lat_stats->latencies[j][i]; 19612e2c009bSjjc 19622e2c009bSjjc if (t1 == 0 || t2 == 0 || t1 == t2) 19632e2c009bSjjc continue; 19642e2c009bSjjc 19652e2c009bSjjc return (-1); 19662e2c009bSjjc } 19672e2c009bSjjc } 19682e2c009bSjjc 19692e2c009bSjjc /* 19702e2c009bSjjc * Local latencies should be same 19712e2c009bSjjc */ 19722e2c009bSjjc t1 = lat_stats->latencies[0][0]; 19732e2c009bSjjc for (i = 1; i < lgrp_plat_node_cnt; i++) { 1974a3114836SGerry Liu if (!memnode_info[i].exists) 19752e2c009bSjjc continue; 19762e2c009bSjjc 19772e2c009bSjjc t2 = lat_stats->latencies[i][i]; 19782e2c009bSjjc if (t2 == 0) 19792e2c009bSjjc continue; 19802e2c009bSjjc 19812e2c009bSjjc if (t1 == 0) { 19822e2c009bSjjc t1 = t2; 19832e2c009bSjjc continue; 19842e2c009bSjjc } 19852e2c009bSjjc 19862e2c009bSjjc if (t1 != t2) 19872e2c009bSjjc return (-2); 19882e2c009bSjjc } 19892e2c009bSjjc 19902e2c009bSjjc /* 19912e2c009bSjjc * Local latencies should be less than remote 19922e2c009bSjjc */ 19932e2c009bSjjc if (t1) { 19942e2c009bSjjc for (i = 0; i < lgrp_plat_node_cnt; i++) { 19952e2c009bSjjc for (j = 0; j < lgrp_plat_node_cnt; j++) { 1996a3114836SGerry Liu if (!memnode_info[j].exists) 19972e2c009bSjjc continue; 19982e2c009bSjjc t2 = lat_stats->latencies[i][j]; 19992e2c009bSjjc if (i == j || t2 == 0) 20002e2c009bSjjc continue; 20012e2c009bSjjc 20022e2c009bSjjc if (t1 >= t2) 20032e2c009bSjjc return (-3); 20042e2c009bSjjc } 20052e2c009bSjjc } 20062e2c009bSjjc } 20072e2c009bSjjc 20082e2c009bSjjc return (0); 20092e2c009bSjjc } 20102e2c009bSjjc 20112e2c009bSjjc 20122e2c009bSjjc /* 2013d5d7cf4eSJonathan Chew * Platform-specific initialization 2014d5d7cf4eSJonathan Chew */ 2015d5d7cf4eSJonathan Chew static void 2016d5d7cf4eSJonathan Chew lgrp_plat_main_init(void) 2017d5d7cf4eSJonathan Chew { 2018d5d7cf4eSJonathan Chew int curnode; 2019d5d7cf4eSJonathan Chew int ht_limit; 2020d5d7cf4eSJonathan Chew int i; 2021d5d7cf4eSJonathan Chew 2022d5d7cf4eSJonathan Chew /* 2023d5d7cf4eSJonathan Chew * Print a notice that MPO is disabled when memory is interleaved 2024d5d7cf4eSJonathan Chew * across nodes....Would do this when it is discovered, but can't 2025d5d7cf4eSJonathan Chew * because it happens way too early during boot.... 2026d5d7cf4eSJonathan Chew */ 2027d5d7cf4eSJonathan Chew if (lgrp_plat_mem_intrlv) 2028d5d7cf4eSJonathan Chew cmn_err(CE_NOTE, 2029d5d7cf4eSJonathan Chew "MPO disabled because memory is interleaved\n"); 2030d5d7cf4eSJonathan Chew 2031d5d7cf4eSJonathan Chew /* 2032d5d7cf4eSJonathan Chew * Don't bother to do any probing if it is disabled, there is only one 2033d5d7cf4eSJonathan Chew * node, or the height of the lgroup topology is less than or equal to 2 2034d5d7cf4eSJonathan Chew */ 2035d5d7cf4eSJonathan Chew ht_limit = lgrp_topo_ht_limit(); 2036d5d7cf4eSJonathan Chew if (!(lgrp_plat_probe_flags & LGRP_PLAT_PROBE_ENABLE) || 2037d5d7cf4eSJonathan Chew max_mem_nodes == 1 || ht_limit <= 2) { 2038d5d7cf4eSJonathan Chew /* 2039d5d7cf4eSJonathan Chew * Setup lgroup latencies for 2 level lgroup topology 2040d5d7cf4eSJonathan Chew * (ie. 
local and remote only) if they haven't been set yet 2041d5d7cf4eSJonathan Chew */ 2042d5d7cf4eSJonathan Chew if (ht_limit == 2 && lgrp_plat_lat_stats.latency_min == -1 && 2043d5d7cf4eSJonathan Chew lgrp_plat_lat_stats.latency_max == 0) 2044a3114836SGerry Liu lgrp_plat_2level_setup(&lgrp_plat_lat_stats); 2045d5d7cf4eSJonathan Chew return; 2046d5d7cf4eSJonathan Chew } 2047d5d7cf4eSJonathan Chew 2048d5d7cf4eSJonathan Chew if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 2049d5d7cf4eSJonathan Chew /* 2050d5d7cf4eSJonathan Chew * Should have been able to probe from CPU 0 when it was added 2051d5d7cf4eSJonathan Chew * to lgroup hierarchy, but may not have been able to then 2052d5d7cf4eSJonathan Chew * because it happens so early in boot that gethrtime() hasn't 2053d5d7cf4eSJonathan Chew * been initialized. (:-( 2054d5d7cf4eSJonathan Chew */ 2055d5d7cf4eSJonathan Chew curnode = lgrp_plat_cpu_to_node(CPU, lgrp_plat_cpu_node, 2056d5d7cf4eSJonathan Chew lgrp_plat_cpu_node_nentries); 2057d5d7cf4eSJonathan Chew ASSERT(curnode >= 0 && curnode < lgrp_plat_node_cnt); 2058d5d7cf4eSJonathan Chew if (lgrp_plat_lat_stats.latencies[curnode][curnode] == 0) 2059d5d7cf4eSJonathan Chew lgrp_plat_probe(); 2060d5d7cf4eSJonathan Chew 2061d5d7cf4eSJonathan Chew return; 2062d5d7cf4eSJonathan Chew } 2063d5d7cf4eSJonathan Chew 2064d5d7cf4eSJonathan Chew /* 2065d5d7cf4eSJonathan Chew * When probing memory, take multiple samples and use one page for 2066d5d7cf4eSJonathan Chew * every sample to determine the lgroup topology 2067d5d7cf4eSJonathan Chew */ 2068d5d7cf4eSJonathan Chew if (lgrp_plat_probe_mem_config.probe_memsize == 0) 2069d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_memsize = PAGESIZE * 2070d5d7cf4eSJonathan Chew lgrp_plat_probe_nsamples; 2071d5d7cf4eSJonathan Chew 2072d5d7cf4eSJonathan Chew /* 2073d5d7cf4eSJonathan Chew * Map memory in each node needed for probing to determine latency 2074d5d7cf4eSJonathan Chew * topology 2075d5d7cf4eSJonathan Chew */ 2076d5d7cf4eSJonathan Chew for (i = 0; i < lgrp_plat_node_cnt; i++) { 2077d5d7cf4eSJonathan Chew int mnode; 2078d5d7cf4eSJonathan Chew 2079d5d7cf4eSJonathan Chew /* 2080d5d7cf4eSJonathan Chew * Skip this node and leave its probe page NULL 2081d5d7cf4eSJonathan Chew * if it doesn't have any memory 2082d5d7cf4eSJonathan Chew */ 2083a3114836SGerry Liu mnode = i; 2084d5d7cf4eSJonathan Chew if (!mem_node_config[mnode].exists) { 2085d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_va[i] = NULL; 2086d5d7cf4eSJonathan Chew continue; 2087d5d7cf4eSJonathan Chew } 2088d5d7cf4eSJonathan Chew 2089d5d7cf4eSJonathan Chew /* 2090d5d7cf4eSJonathan Chew * Allocate kernel virtual memory for probing (one page per sample) 2091d5d7cf4eSJonathan Chew */ 2092d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_va[i] = vmem_alloc(heap_arena, 2093d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_memsize, VM_NOSLEEP); 2094d5d7cf4eSJonathan Chew if (lgrp_plat_probe_mem_config.probe_va[i] == NULL) { 2095d5d7cf4eSJonathan Chew cmn_err(CE_WARN, 2096d5d7cf4eSJonathan Chew "lgrp_plat_main_init: couldn't allocate memory"); 2097d5d7cf4eSJonathan Chew return; 2098d5d7cf4eSJonathan Chew } 2099d5d7cf4eSJonathan Chew 2100d5d7cf4eSJonathan Chew /* 2101d5d7cf4eSJonathan Chew * Get PFN for first page in each node 2102d5d7cf4eSJonathan Chew */ 2103d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_pfn[i] = 2104d5d7cf4eSJonathan Chew mem_node_config[mnode].physbase; 2105d5d7cf4eSJonathan Chew 2106d5d7cf4eSJonathan Chew /* 2107d5d7cf4eSJonathan Chew * Map virtual page to first page in node 
2108d5d7cf4eSJonathan Chew */ 2109d5d7cf4eSJonathan Chew hat_devload(kas.a_hat, lgrp_plat_probe_mem_config.probe_va[i], 2110d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_memsize, 2111d5d7cf4eSJonathan Chew lgrp_plat_probe_mem_config.probe_pfn[i], 2112d5d7cf4eSJonathan Chew PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 2113d5d7cf4eSJonathan Chew HAT_LOAD_NOCONSIST); 2114d5d7cf4eSJonathan Chew } 2115d5d7cf4eSJonathan Chew 2116d5d7cf4eSJonathan Chew /* 2117d5d7cf4eSJonathan Chew * Probe from current CPU 2118d5d7cf4eSJonathan Chew */ 2119d5d7cf4eSJonathan Chew lgrp_plat_probe(); 2120d5d7cf4eSJonathan Chew } 2121d5d7cf4eSJonathan Chew 2122d5d7cf4eSJonathan Chew 2123d5d7cf4eSJonathan Chew /* 21242e2c009bSjjc * Return the number of free, allocatable, or installed 21252e2c009bSjjc * pages in an lgroup 21262e2c009bSjjc * This is a copy of the MAX_MEM_NODES == 1 version of the routine 21272e2c009bSjjc * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 21282e2c009bSjjc */ 21292e2c009bSjjc static pgcnt_t 21302e2c009bSjjc lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 21312e2c009bSjjc { 2132a3114836SGerry Liu _NOTE(ARGUNUSED(lgrphand)); 2133a3114836SGerry Liu 21342e2c009bSjjc struct memlist *mlist; 21352e2c009bSjjc pgcnt_t npgs = 0; 21362e2c009bSjjc extern struct memlist *phys_avail; 21372e2c009bSjjc extern struct memlist *phys_install; 21382e2c009bSjjc 21392e2c009bSjjc switch (query) { 21402e2c009bSjjc case LGRP_MEM_SIZE_FREE: 21412e2c009bSjjc return ((pgcnt_t)freemem); 21422e2c009bSjjc case LGRP_MEM_SIZE_AVAIL: 21432e2c009bSjjc memlist_read_lock(); 214456f33205SJonathan Adams for (mlist = phys_avail; mlist; mlist = mlist->ml_next) 214556f33205SJonathan Adams npgs += btop(mlist->ml_size); 21462e2c009bSjjc memlist_read_unlock(); 21472e2c009bSjjc return (npgs); 21482e2c009bSjjc case LGRP_MEM_SIZE_INSTALL: 21492e2c009bSjjc memlist_read_lock(); 215056f33205SJonathan Adams for (mlist = phys_install; mlist; mlist = mlist->ml_next) 215156f33205SJonathan Adams npgs += btop(mlist->ml_size); 21522e2c009bSjjc memlist_read_unlock(); 21532e2c009bSjjc return (npgs); 21542e2c009bSjjc default: 21552e2c009bSjjc return ((pgcnt_t)0); 21562e2c009bSjjc } 21572e2c009bSjjc } 21582e2c009bSjjc 21592e2c009bSjjc 21602e2c009bSjjc /* 21612e2c009bSjjc * Update node to proximity domain mappings for given domain and return node ID 21622e2c009bSjjc */ 21632e2c009bSjjc static int 2164d821f0f0Sjjc lgrp_plat_node_domain_update(node_domain_map_t *node_domain, int node_cnt, 2165d821f0f0Sjjc uint32_t domain) 21662e2c009bSjjc { 21672e2c009bSjjc uint_t node; 21682e2c009bSjjc uint_t start; 21692e2c009bSjjc 21702e2c009bSjjc /* 21712e2c009bSjjc * Hash proximity domain ID into node to domain mapping table (array) 21722e2c009bSjjc * and add entry for it into first non-existent or matching entry found 21732e2c009bSjjc */ 2174d821f0f0Sjjc node = start = NODE_DOMAIN_HASH(domain, node_cnt); 21752e2c009bSjjc do { 21762e2c009bSjjc /* 21772e2c009bSjjc * Entry doesn't exist yet, so create one for this proximity 21782e2c009bSjjc * domain and return node ID which is index into mapping table. 
21792e2c009bSjjc */ 21802e2c009bSjjc if (!node_domain[node].exists) { 21812e2c009bSjjc node_domain[node].prox_domain = domain; 2182a3114836SGerry Liu membar_producer(); 2183a3114836SGerry Liu node_domain[node].exists = 1; 21842e2c009bSjjc return (node); 21852e2c009bSjjc } 21862e2c009bSjjc 21872e2c009bSjjc /* 21882e2c009bSjjc * Entry exists for this proximity domain already, so just 21892e2c009bSjjc * return node ID (index into table). 21902e2c009bSjjc */ 21912e2c009bSjjc if (node_domain[node].prox_domain == domain) 21922e2c009bSjjc return (node); 2193d821f0f0Sjjc node = NODE_DOMAIN_HASH(node + 1, node_cnt); 21942e2c009bSjjc } while (node != start); 21952e2c009bSjjc 21962e2c009bSjjc /* 21972e2c009bSjjc * Ran out of supported number of entries which shouldn't happen.... 21982e2c009bSjjc */ 21992e2c009bSjjc ASSERT(node != start); 22002e2c009bSjjc return (-1); 22012e2c009bSjjc } 22022e2c009bSjjc 22032e2c009bSjjc /* 22042e2c009bSjjc * Update node memory information for given proximity domain with specified 22052e2c009bSjjc * starting and ending physical address range (and return positive numbers for 22062e2c009bSjjc * success and negative ones for errors) 22072e2c009bSjjc */ 22082e2c009bSjjc static int 2209a3114836SGerry Liu lgrp_plat_memnode_info_update(node_domain_map_t *node_domain, int node_cnt, 2210a3114836SGerry Liu memnode_phys_addr_map_t *memnode_info, int memnode_cnt, uint64_t start, 2211a3114836SGerry Liu uint64_t end, uint32_t domain, uint32_t device_id) 22122e2c009bSjjc { 2213a3114836SGerry Liu int node, mnode; 22142e2c009bSjjc 22152e2c009bSjjc /* 22162e2c009bSjjc * Get node number for proximity domain 22172e2c009bSjjc */ 2218d821f0f0Sjjc node = lgrp_plat_domain_to_node(node_domain, node_cnt, domain); 22192e2c009bSjjc if (node == -1) { 2220d821f0f0Sjjc node = lgrp_plat_node_domain_update(node_domain, node_cnt, 2221d821f0f0Sjjc domain); 22222e2c009bSjjc if (node == -1) 22232e2c009bSjjc return (-1); 22242e2c009bSjjc } 22252e2c009bSjjc 22262e2c009bSjjc /* 2227a3114836SGerry Liu * This function is called during boot if device_id is 2228a3114836SGerry Liu * ACPI_MEMNODE_DEVID_BOOT, otherwise it's called at runtime for 2229a3114836SGerry Liu * memory DR operations. 
2230a3114836SGerry Liu */ 2231a3114836SGerry Liu if (device_id != ACPI_MEMNODE_DEVID_BOOT) { 2232a3114836SGerry Liu ASSERT(lgrp_plat_max_mem_node <= memnode_cnt); 2233a3114836SGerry Liu 2234a3114836SGerry Liu for (mnode = lgrp_plat_node_cnt; 2235a3114836SGerry Liu mnode < lgrp_plat_max_mem_node; mnode++) { 2236a3114836SGerry Liu if (memnode_info[mnode].exists && 2237a3114836SGerry Liu memnode_info[mnode].prox_domain == domain && 2238a3114836SGerry Liu memnode_info[mnode].device_id == device_id) { 2239a3114836SGerry Liu if (btop(start) < memnode_info[mnode].start) 2240a3114836SGerry Liu memnode_info[mnode].start = btop(start); 2241a3114836SGerry Liu if (btop(end) > memnode_info[mnode].end) 2242a3114836SGerry Liu memnode_info[mnode].end = btop(end); 2243a3114836SGerry Liu return (1); 2244a3114836SGerry Liu } 2245a3114836SGerry Liu } 2246a3114836SGerry Liu 2247a3114836SGerry Liu if (lgrp_plat_max_mem_node >= memnode_cnt) { 2248a3114836SGerry Liu return (-3); 2249a3114836SGerry Liu } else { 2250a3114836SGerry Liu lgrp_plat_max_mem_node++; 2251a3114836SGerry Liu memnode_info[mnode].start = btop(start); 2252a3114836SGerry Liu memnode_info[mnode].end = btop(end); 2253a3114836SGerry Liu memnode_info[mnode].prox_domain = domain; 2254a3114836SGerry Liu memnode_info[mnode].device_id = device_id; 2255a3114836SGerry Liu memnode_info[mnode].lgrphand = node; 2256a3114836SGerry Liu membar_producer(); 2257a3114836SGerry Liu memnode_info[mnode].exists = 1; 2258a3114836SGerry Liu return (0); 2259a3114836SGerry Liu } 2260a3114836SGerry Liu } 2261a3114836SGerry Liu 2262a3114836SGerry Liu /* 22632e2c009bSjjc * Create entry in table for node if it doesn't exist 22642e2c009bSjjc */ 2265a3114836SGerry Liu ASSERT(node < memnode_cnt); 2266a3114836SGerry Liu if (!memnode_info[node].exists) { 2267a3114836SGerry Liu memnode_info[node].start = btop(start); 2268a3114836SGerry Liu memnode_info[node].end = btop(end); 2269a3114836SGerry Liu memnode_info[node].prox_domain = domain; 2270a3114836SGerry Liu memnode_info[node].device_id = device_id; 2271a3114836SGerry Liu memnode_info[node].lgrphand = node; 2272a3114836SGerry Liu membar_producer(); 2273a3114836SGerry Liu memnode_info[node].exists = 1; 22742e2c009bSjjc return (0); 22752e2c009bSjjc } 22762e2c009bSjjc 22772e2c009bSjjc /* 22782e2c009bSjjc * Entry already exists for this proximity domain 22792e2c009bSjjc * 22802e2c009bSjjc * There may be more than one SRAT memory entry for a domain, so we may 22812e2c009bSjjc * need to update existing start or end address for the node. 22822e2c009bSjjc */ 2283a3114836SGerry Liu if (memnode_info[node].prox_domain == domain) { 2284a3114836SGerry Liu if (btop(start) < memnode_info[node].start) 2285a3114836SGerry Liu memnode_info[node].start = btop(start); 2286a3114836SGerry Liu if (btop(end) > memnode_info[node].end) 2287a3114836SGerry Liu memnode_info[node].end = btop(end); 22882e2c009bSjjc return (1); 22892e2c009bSjjc } 22902e2c009bSjjc return (-2); 22912e2c009bSjjc } 22922e2c009bSjjc 22932e2c009bSjjc 22942e2c009bSjjc /* 2295a3114836SGerry Liu * Have to sort nodes by starting physical address because plat_mnode_xcheck() 2296a3114836SGerry Liu * assumes and expects memnodes to be sorted in ascending order by physical 2297a3114836SGerry Liu * address. 
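 *
 * Example (hypothetical addresses): if SRAT enumeration produced node 0
 * spanning 4G-8G and node 1 spanning 0-4G, the bubble sort below swaps
 * both the node_domain[] and memnode_info[] entries so that node 0
 * covers 0-4G, then repairs any cpu_node[] entries whose node IDs
 * changed as a result.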
229881d9ccb6SJonathan Chew */ 229981d9ccb6SJonathan Chew static void 230081d9ccb6SJonathan Chew lgrp_plat_node_sort(node_domain_map_t *node_domain, int node_cnt, 2301a3114836SGerry Liu cpu_node_map_t *cpu_node, int cpu_count, 2302a3114836SGerry Liu memnode_phys_addr_map_t *memnode_info) 230381d9ccb6SJonathan Chew { 230481d9ccb6SJonathan Chew boolean_t found; 230581d9ccb6SJonathan Chew int i; 230681d9ccb6SJonathan Chew int j; 230781d9ccb6SJonathan Chew int n; 230881d9ccb6SJonathan Chew boolean_t sorted; 230981d9ccb6SJonathan Chew boolean_t swapped; 231081d9ccb6SJonathan Chew 231181d9ccb6SJonathan Chew if (!lgrp_plat_node_sort_enable || node_cnt <= 1 || 2312a3114836SGerry Liu node_domain == NULL || memnode_info == NULL) 231381d9ccb6SJonathan Chew return; 231481d9ccb6SJonathan Chew 231581d9ccb6SJonathan Chew /* 231681d9ccb6SJonathan Chew * Sorted already? 231781d9ccb6SJonathan Chew */ 231881d9ccb6SJonathan Chew sorted = B_TRUE; 231981d9ccb6SJonathan Chew for (i = 0; i < node_cnt - 1; i++) { 232081d9ccb6SJonathan Chew /* 232181d9ccb6SJonathan Chew * Skip entries that don't exist 232281d9ccb6SJonathan Chew */ 2323a3114836SGerry Liu if (!memnode_info[i].exists) 232481d9ccb6SJonathan Chew continue; 232581d9ccb6SJonathan Chew 232681d9ccb6SJonathan Chew /* 232781d9ccb6SJonathan Chew * Try to find next existing entry to compare against 232881d9ccb6SJonathan Chew */ 232981d9ccb6SJonathan Chew found = B_FALSE; 233081d9ccb6SJonathan Chew for (j = i + 1; j < node_cnt; j++) { 2331a3114836SGerry Liu if (memnode_info[j].exists) { 233281d9ccb6SJonathan Chew found = B_TRUE; 233381d9ccb6SJonathan Chew break; 233481d9ccb6SJonathan Chew } 233581d9ccb6SJonathan Chew } 233681d9ccb6SJonathan Chew 233781d9ccb6SJonathan Chew /* 233881d9ccb6SJonathan Chew * Done if no more existing entries to compare against 233981d9ccb6SJonathan Chew */ 234081d9ccb6SJonathan Chew if (found == B_FALSE) 234181d9ccb6SJonathan Chew break; 234281d9ccb6SJonathan Chew 234381d9ccb6SJonathan Chew /* 234481d9ccb6SJonathan Chew * Not sorted if starting address of current entry is bigger 234581d9ccb6SJonathan Chew * than starting address of next existing entry 234681d9ccb6SJonathan Chew */ 2347a3114836SGerry Liu if (memnode_info[i].start > memnode_info[j].start) { 234881d9ccb6SJonathan Chew sorted = B_FALSE; 234981d9ccb6SJonathan Chew break; 235081d9ccb6SJonathan Chew } 235181d9ccb6SJonathan Chew } 235281d9ccb6SJonathan Chew 235381d9ccb6SJonathan Chew /* 235481d9ccb6SJonathan Chew * Don't need to sort if sorted already 235581d9ccb6SJonathan Chew */ 235681d9ccb6SJonathan Chew if (sorted == B_TRUE) 235781d9ccb6SJonathan Chew return; 235881d9ccb6SJonathan Chew 235981d9ccb6SJonathan Chew /* 236081d9ccb6SJonathan Chew * Just use bubble sort since number of nodes is small 236181d9ccb6SJonathan Chew */ 236281d9ccb6SJonathan Chew n = node_cnt; 236381d9ccb6SJonathan Chew do { 236481d9ccb6SJonathan Chew swapped = B_FALSE; 236581d9ccb6SJonathan Chew n--; 236681d9ccb6SJonathan Chew for (i = 0; i < n; i++) { 236781d9ccb6SJonathan Chew /* 236881d9ccb6SJonathan Chew * Skip entries that don't exist 236981d9ccb6SJonathan Chew */ 2370a3114836SGerry Liu if (!memnode_info[i].exists) 237181d9ccb6SJonathan Chew continue; 237281d9ccb6SJonathan Chew 237381d9ccb6SJonathan Chew /* 237481d9ccb6SJonathan Chew * Try to find next existing entry to compare against 237581d9ccb6SJonathan Chew */ 237681d9ccb6SJonathan Chew found = B_FALSE; 237781d9ccb6SJonathan Chew for (j = i + 1; j <= n; j++) { 2378a3114836SGerry Liu if (memnode_info[j].exists) { 237981d9ccb6SJonathan Chew 
found = B_TRUE; 238081d9ccb6SJonathan Chew break; 238181d9ccb6SJonathan Chew } 238281d9ccb6SJonathan Chew } 238381d9ccb6SJonathan Chew 238481d9ccb6SJonathan Chew /* 238581d9ccb6SJonathan Chew * Done if no more existing entries to compare against 238681d9ccb6SJonathan Chew */ 238781d9ccb6SJonathan Chew if (found == B_FALSE) 238881d9ccb6SJonathan Chew break; 238981d9ccb6SJonathan Chew 2390a3114836SGerry Liu if (memnode_info[i].start > memnode_info[j].start) { 2391a3114836SGerry Liu memnode_phys_addr_map_t save_addr; 239281d9ccb6SJonathan Chew node_domain_map_t save_node; 239381d9ccb6SJonathan Chew 239481d9ccb6SJonathan Chew /* 239581d9ccb6SJonathan Chew * Swap node to proximity domain ID assignments 239681d9ccb6SJonathan Chew */ 239781d9ccb6SJonathan Chew bcopy(&node_domain[i], &save_node, 239881d9ccb6SJonathan Chew sizeof (node_domain_map_t)); 239981d9ccb6SJonathan Chew bcopy(&node_domain[j], &node_domain[i], 240081d9ccb6SJonathan Chew sizeof (node_domain_map_t)); 240181d9ccb6SJonathan Chew bcopy(&save_node, &node_domain[j], 240281d9ccb6SJonathan Chew sizeof (node_domain_map_t)); 240381d9ccb6SJonathan Chew 240481d9ccb6SJonathan Chew /* 240581d9ccb6SJonathan Chew * Swap node to physical memory assignments 240681d9ccb6SJonathan Chew */ 2407a3114836SGerry Liu bcopy(&memnode_info[i], &save_addr, 2408a3114836SGerry Liu sizeof (memnode_phys_addr_map_t)); 2409a3114836SGerry Liu bcopy(&memnode_info[j], &memnode_info[i], 2410a3114836SGerry Liu sizeof (memnode_phys_addr_map_t)); 2411a3114836SGerry Liu bcopy(&save_addr, &memnode_info[j], 2412a3114836SGerry Liu sizeof (memnode_phys_addr_map_t)); 241381d9ccb6SJonathan Chew swapped = B_TRUE; 241481d9ccb6SJonathan Chew } 241581d9ccb6SJonathan Chew } 241681d9ccb6SJonathan Chew } while (swapped == B_TRUE); 241781d9ccb6SJonathan Chew 241881d9ccb6SJonathan Chew /* 241981d9ccb6SJonathan Chew * Check to make sure that CPUs are assigned to correct node IDs now since 242081d9ccb6SJonathan Chew * node to proximity domain ID assignments may have been changed above 242181d9ccb6SJonathan Chew */ 242281d9ccb6SJonathan Chew if (n == node_cnt - 1 || cpu_node == NULL || cpu_count < 1) 242381d9ccb6SJonathan Chew return; 242481d9ccb6SJonathan Chew for (i = 0; i < cpu_count; i++) { 242581d9ccb6SJonathan Chew int node; 242681d9ccb6SJonathan Chew 242781d9ccb6SJonathan Chew node = lgrp_plat_domain_to_node(node_domain, node_cnt, 242881d9ccb6SJonathan Chew cpu_node[i].prox_domain); 242981d9ccb6SJonathan Chew if (cpu_node[i].node != node) 243081d9ccb6SJonathan Chew cpu_node[i].node = node; 243181d9ccb6SJonathan Chew } 243281d9ccb6SJonathan Chew 243381d9ccb6SJonathan Chew } 243481d9ccb6SJonathan Chew 243581d9ccb6SJonathan Chew 243681d9ccb6SJonathan Chew /* 24372e2c009bSjjc * Return time needed to probe from current CPU to memory in given node 24382e2c009bSjjc */ 24392e2c009bSjjc static hrtime_t 2440d5d7cf4eSJonathan Chew lgrp_plat_probe_time(int to, cpu_node_map_t *cpu_node, int cpu_node_nentries, 24412e2c009bSjjc lgrp_plat_probe_mem_config_t *probe_mem_config, 24422e2c009bSjjc lgrp_plat_latency_stats_t *lat_stats, lgrp_plat_probe_stats_t *probe_stats) 24432e2c009bSjjc { 24442e2c009bSjjc caddr_t buf; 24452e2c009bSjjc hrtime_t elapsed; 24462e2c009bSjjc hrtime_t end; 24472e2c009bSjjc int from; 24482e2c009bSjjc int i; 24492e2c009bSjjc int ipl; 24502e2c009bSjjc hrtime_t max; 24512e2c009bSjjc hrtime_t min; 24522e2c009bSjjc hrtime_t start; 24532e2c009bSjjc extern int use_sse_pagecopy; 24542e2c009bSjjc 24552e2c009bSjjc /* 24562e2c009bSjjc * Determine ID of node containing current CPU 
24572e2c009bSjjc */ 2458d5d7cf4eSJonathan Chew from = lgrp_plat_cpu_to_node(CPU, cpu_node, cpu_node_nentries); 24592e2c009bSjjc ASSERT(from >= 0 && from < lgrp_plat_node_cnt); 24602e2c009bSjjc 24612e2c009bSjjc /* 24622e2c009bSjjc * Do common work for probing main memory 24632e2c009bSjjc */ 24642e2c009bSjjc if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_PGCPY) { 24652e2c009bSjjc /* 24662e2c009bSjjc * Skip probing any nodes without memory and 24672e2c009bSjjc * set probe time to 0 24682e2c009bSjjc */ 24692e2c009bSjjc if (probe_mem_config->probe_va[to] == NULL) { 24702e2c009bSjjc lat_stats->latencies[from][to] = 0; 24712e2c009bSjjc return (0); 24722e2c009bSjjc } 24732e2c009bSjjc 24742e2c009bSjjc /* 24752e2c009bSjjc * Invalidate caches once instead of once every sample 24762e2c009bSjjc * which should cut cost of probing by a lot 24772e2c009bSjjc */ 24782e2c009bSjjc probe_stats->flush_cost = gethrtime(); 24792e2c009bSjjc invalidate_cache(); 24802e2c009bSjjc probe_stats->flush_cost = gethrtime() - 24812e2c009bSjjc probe_stats->flush_cost; 24822e2c009bSjjc probe_stats->probe_cost_total += probe_stats->flush_cost; 24832e2c009bSjjc } 24842e2c009bSjjc 24852e2c009bSjjc /* 24862e2c009bSjjc * Probe from current CPU to given memory using specified operation 24872e2c009bSjjc * and take specified number of samples 24882e2c009bSjjc */ 24892e2c009bSjjc max = 0; 24902e2c009bSjjc min = -1; 24912e2c009bSjjc for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 24922e2c009bSjjc probe_stats->probe_cost = gethrtime(); 24932e2c009bSjjc 24942e2c009bSjjc /* 24952e2c009bSjjc * Can't measure probe time if gethrtime() isn't working yet 24962e2c009bSjjc */ 24972e2c009bSjjc if (probe_stats->probe_cost == 0 && gethrtime() == 0) 24982e2c009bSjjc return (0); 24992e2c009bSjjc 25002e2c009bSjjc if (lgrp_plat_probe_flags & LGRP_PLAT_PROBE_VENDOR) { 25012e2c009bSjjc /* 25022e2c009bSjjc * Measure how long it takes to read vendor ID from 25032e2c009bSjjc * Northbridge 25042e2c009bSjjc */ 25052e2c009bSjjc elapsed = opt_probe_vendor(to, lgrp_plat_probe_nreads); 25062e2c009bSjjc } else { 25072e2c009bSjjc /* 25082e2c009bSjjc * Measure how long it takes to copy page 25092e2c009bSjjc * on top of itself 25102e2c009bSjjc */ 25112e2c009bSjjc buf = probe_mem_config->probe_va[to] + (i * PAGESIZE); 25122e2c009bSjjc 25132e2c009bSjjc kpreempt_disable(); 25142e2c009bSjjc ipl = splhigh(); 25152e2c009bSjjc start = gethrtime(); 25162e2c009bSjjc if (use_sse_pagecopy) 25172e2c009bSjjc hwblkpagecopy(buf, buf); 25182e2c009bSjjc else 25192e2c009bSjjc bcopy(buf, buf, PAGESIZE); 25202e2c009bSjjc end = gethrtime(); 25212e2c009bSjjc elapsed = end - start; 25222e2c009bSjjc splx(ipl); 25232e2c009bSjjc kpreempt_enable(); 25242e2c009bSjjc } 25252e2c009bSjjc 25262e2c009bSjjc probe_stats->probe_cost = gethrtime() - 25272e2c009bSjjc probe_stats->probe_cost; 25282e2c009bSjjc probe_stats->probe_cost_total += probe_stats->probe_cost; 25292e2c009bSjjc 25302e2c009bSjjc if (min == -1 || elapsed < min) 25312e2c009bSjjc min = elapsed; 25322e2c009bSjjc if (elapsed > max) 25332e2c009bSjjc max = elapsed; 25342e2c009bSjjc } 25352e2c009bSjjc 25362e2c009bSjjc /* 25372e2c009bSjjc * Update minimum and maximum probe times between 25382e2c009bSjjc * these two nodes 25392e2c009bSjjc */ 25402e2c009bSjjc if (min < probe_stats->probe_min[from][to] || 25412e2c009bSjjc probe_stats->probe_min[from][to] == 0) 25422e2c009bSjjc probe_stats->probe_min[from][to] = min; 25432e2c009bSjjc 25442e2c009bSjjc if (max > probe_stats->probe_max[from][to]) 25452e2c009bSjjc probe_stats->probe_max[from][to] = max; 
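	/*
	 * Note that the minimum sample, not the mean, is returned as the
	 * probe time: interference (interrupts, cache or TLB effects,
	 * other memory traffic) can only inflate a sample, so the fastest
	 * observation is the best available estimate of the true access
	 * latency.
	 */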


/*
 * Read boot property with CPU to APIC ID array, fill in CPU to node ID
 * mapping table with APIC ID for each CPU (if pointer to table isn't NULL),
 * and return number of CPU APIC IDs.
 *
 * NOTE: This code assumes that CPU IDs are assigned in the order that they
 *	 appear in the cpu_apicid_array boot property, which is based on and
 *	 follows the same ordering as the processor list in the ACPI MADT.
 *	 If the code in usr/src/uts/i86pc/io/pcplusmp/apic.c that reads the
 *	 MADT and assigns CPU IDs ever changes, then this code will need to
 *	 change too....
 */
static int
lgrp_plat_process_cpu_apicids(cpu_node_map_t *cpu_node)
{
	int		boot_prop_len;
	char		*boot_prop_name = BP_CPU_APICID_ARRAY;
	uint32_t	*cpu_apicid_array;
	int		i;
	int		n;

	/*
	 * Check length of property value
	 */
	boot_prop_len = BOP_GETPROPLEN(bootops, boot_prop_name);
	if (boot_prop_len <= 0)
		return (-1);

	/*
	 * Calculate number of entries in array and return when the system is
	 * not very interesting for NUMA.  It's not interesting for NUMA if
	 * the system has only one CPU and doesn't support CPU hotplug.
	 */
	n = boot_prop_len / sizeof (*cpu_apicid_array);
	if (n == 1 && !plat_dr_support_cpu())
		return (-2);

	cpu_apicid_array = (uint32_t *)BOP_ALLOC(bootops, NULL, boot_prop_len,
	    sizeof (*cpu_apicid_array));
	/*
	 * Get CPU to APIC ID property value
	 */
	if (cpu_apicid_array == NULL ||
	    BOP_GETPROP(bootops, boot_prop_name, cpu_apicid_array) < 0)
		return (-3);

	/*
	 * Just return number of CPU APIC IDs if CPU to node mapping table is
	 * NULL
	 */
	if (cpu_node == NULL) {
		if (plat_dr_support_cpu() && n >= boot_ncpus) {
			return (boot_ncpus);
		} else {
			return (n);
		}
	}

	/*
	 * Fill in CPU to node ID mapping table with APIC ID for each CPU
	 */
	for (i = 0; i < n; i++) {
		/*
		 * Only add boot CPUs into the map if CPU DR is enabled.
		 */
		if (plat_dr_support_cpu() && i >= boot_ncpus)
			break;
		cpu_node[i].exists = 1;
		cpu_node[i].apicid = cpu_apicid_array[i];
		cpu_node[i].prox_domain = UINT32_MAX;
		cpu_node[i].node = UINT_MAX;
	}

	/*
	 * Return number of CPUs based on number of APIC IDs
	 */
	return (i);
}


/*
 * Read ACPI System Locality Information Table (SLIT) to determine how far
 * each NUMA node is from each other
 */
static int
lgrp_plat_process_slit(ACPI_TABLE_SLIT *tp,
    node_domain_map_t *node_domain, uint_t node_cnt,
    memnode_phys_addr_map_t *memnode_info, lgrp_plat_latency_stats_t *lat_stats)
{
	int		i;
	int		j;
	int		src;
	int		dst;
	int		localities;
	hrtime_t	max;
	hrtime_t	min;
	int		retval;
	uint8_t		*slit_entries;

	if (tp == NULL || !lgrp_plat_slit_enable)
		return (1);

	if (lat_stats == NULL)
		return (2);

	localities = tp->LocalityCount;

	min = lat_stats->latency_min;
	max = lat_stats->latency_max;

	/*
	 * Fill in latency matrix based on SLIT entries
	 */
	slit_entries = tp->Entry;
	for (i = 0; i < localities; i++) {
		src = lgrp_plat_domain_to_node(node_domain,
		    node_cnt, i);
		if (src == -1)
			continue;

		for (j = 0; j < localities; j++) {
			uint8_t	latency;

			dst = lgrp_plat_domain_to_node(node_domain,
			    node_cnt, j);
			if (dst == -1)
				continue;

			latency = slit_entries[(i * localities) + j];
			lat_stats->latencies[src][dst] = latency;
			if (latency < min || min == -1)
				min = latency;
			if (latency > max)
				max = latency;
		}
	}

	/*
	 * Verify that latencies/distances given in SLIT look reasonable
	 */
	retval = lgrp_plat_latency_verify(memnode_info, lat_stats);

	if (retval) {
		/*
		 * Reinitialize (zero) latency table since SLIT doesn't look
		 * right
		 */
		for (i = 0; i < localities; i++) {
			for (j = 0; j < localities; j++)
				lat_stats->latencies[i][j] = 0;
		}
	} else {
		/*
		 * Update min and max latencies seen since SLIT looks valid
		 */
		lat_stats->latency_min = min;
		lat_stats->latency_max = max;
	}

	return (retval);
}
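

/*
 * For illustration (a hypothetical two-locality machine), the SLIT entries
 * consumed above form a flat LocalityCount x LocalityCount byte matrix:
 *
 *	Entry[] = { 10, 21,		distances from locality 0 to {0, 1}
 *		    21, 10 };		distances from locality 1 to {0, 1}
 *
 * so slit_entries[(i * localities) + j] is the relative distance from
 * locality i to locality j, with 10 defined by ACPI as the distance from
 * a locality to itself.
 */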


/*
 * Update lgrp latencies according to information returned by ACPI _SLI method.
 */
static int
lgrp_plat_process_sli(uint32_t domain_id, uchar_t *sli_info,
    uint32_t sli_cnt, node_domain_map_t *node_domain, uint_t node_cnt,
    lgrp_plat_latency_stats_t *lat_stats)
{
	int		i;
	int		src, dst;
	uint8_t		latency;
	hrtime_t	max, min;

	if (lat_stats == NULL || sli_info == NULL ||
	    sli_cnt == 0 || domain_id >= sli_cnt)
		return (-1);

	src = lgrp_plat_domain_to_node(node_domain, node_cnt, domain_id);
	if (src == -1) {
		src = lgrp_plat_node_domain_update(node_domain, node_cnt,
		    domain_id);
		if (src == -1)
			return (-1);
	}

	/*
	 * Don't update latency info if topology has been flattened to 2 levels.
	 */
	if (lgrp_plat_topo_flatten != 0) {
		return (0);
	}

	/*
	 * Latency information for this proximity domain is already present.
	 * TODO: support adjusting latency information at runtime.
	 */
	if (lat_stats->latencies[src][src] != 0) {
		return (0);
	}

	/* Validate latency information. */
	for (i = 0; i < sli_cnt; i++) {
		if (i == domain_id) {
			if (sli_info[i] != ACPI_SLIT_SELF_LATENCY ||
			    sli_info[sli_cnt + i] != ACPI_SLIT_SELF_LATENCY) {
				return (-1);
			}
		} else {
			if (sli_info[i] <= ACPI_SLIT_SELF_LATENCY ||
			    sli_info[sli_cnt + i] <= ACPI_SLIT_SELF_LATENCY ||
			    sli_info[i] != sli_info[sli_cnt + i]) {
				return (-1);
			}
		}
	}

	min = lat_stats->latency_min;
	max = lat_stats->latency_max;
	for (i = 0; i < sli_cnt; i++) {
		dst = lgrp_plat_domain_to_node(node_domain, node_cnt, i);
		if (dst == -1)
			continue;

		ASSERT(sli_info[i] == sli_info[sli_cnt + i]);

		/* Update row in latencies matrix. */
		latency = sli_info[i];
		lat_stats->latencies[src][dst] = latency;
		if (latency < min || min == -1)
			min = latency;
		if (latency > max)
			max = latency;

		/* Update column in latencies matrix. */
		latency = sli_info[sli_cnt + i];
		lat_stats->latencies[dst][src] = latency;
		if (latency < min || min == -1)
			min = latency;
		if (latency > max)
			max = latency;
	}
	lat_stats->latency_min = min;
	lat_stats->latency_max = max;

	return (0);
}
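

/*
 * For reference, the _SLI buffer validated and consumed above holds
 * 2 * sli_cnt bytes: the first sli_cnt bytes give the latency from this
 * proximity domain to every domain (a row of the latency matrix) and the
 * next sli_cnt bytes give the latency from every domain back to this one
 * (a column), which is why entries i and sli_cnt + i must agree.
 */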


/*
 * Read ACPI System Resource Affinity Table (SRAT) to determine which CPUs
 * and memory are local to each other in the same NUMA node and return the
 * number of nodes.
 */
static int
lgrp_plat_process_srat(ACPI_TABLE_SRAT *tp, ACPI_TABLE_MSCT *mp,
    uint32_t *prox_domain_min, node_domain_map_t *node_domain,
    cpu_node_map_t *cpu_node, int cpu_count,
    memnode_phys_addr_map_t *memnode_info)
{
	ACPI_SUBTABLE_HEADER	*item, *srat_end;
	int			i;
	int			node_cnt;
	int			proc_entry_count;
	int			rc;

	/*
	 * Nothing to do when no SRAT or disabled
	 */
	if (tp == NULL || !lgrp_plat_srat_enable)
		return (-1);

	/*
	 * Try to get domain information from MSCT table.
	 * ACPI4.0: OSPM will use information provided by the MSCT only
	 * when the System Resource Affinity Table (SRAT) exists.
	 */
	node_cnt = lgrp_plat_msct_domains(mp, prox_domain_min);
	if (node_cnt <= 0) {
		/*
		 * Determine number of nodes by counting number of proximity
		 * domains in SRAT.
		 */
		node_cnt = lgrp_plat_srat_domains(tp, prox_domain_min);
	}
	/*
	 * Return if number of nodes is 1 or less since we don't need to read
	 * the SRAT then.
	 */
	if (node_cnt == 1)
		return (1);
	else if (node_cnt <= 0)
		return (-2);

	/*
	 * Walk through SRAT, examining each CPU and memory entry to determine
	 * which CPUs and memory belong to which node.
	 */
	item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
	srat_end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
	proc_entry_count = 0;
	while (item < srat_end) {
		uint32_t	apic_id;
		uint32_t	domain;
		uint64_t	end;
		uint64_t	length;
		uint64_t	start;

		switch (item->Type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY: {	/* CPU entry */
			ACPI_SRAT_CPU_AFFINITY *cpu =
			    (ACPI_SRAT_CPU_AFFINITY *)item;

			if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
			    cpu_node == NULL)
				break;

			/*
			 * Calculate domain (node) ID and fill in APIC ID to
			 * domain/node mapping table
			 */
			domain = cpu->ProximityDomainLo;
			for (i = 0; i < 3; i++) {
				domain += cpu->ProximityDomainHi[i] <<
				    ((i + 1) * 8);
			}
			apic_id = cpu->ApicId;

			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
			    cpu_node, cpu_count, apic_id, domain);
			if (rc < 0)
				return (-3);
			else if (rc == 0)
				proc_entry_count++;
			break;
		}
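		/*
		 * For example (hypothetical values), ProximityDomainLo == 0x02
		 * with ProximityDomainHi[] == { 0x01, 0x00, 0x00 } assembles
		 * to proximity domain 0x0102 above.  The Lo/Hi split exists
		 * because ACPI 1.0 defined only an 8-bit proximity domain and
		 * ACPI 3.0 widened it to 32 bits using formerly reserved
		 * bytes.
		 */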
		case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {	/* memory entry */
			ACPI_SRAT_MEM_AFFINITY *mem =
			    (ACPI_SRAT_MEM_AFFINITY *)item;

			if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED) ||
			    memnode_info == NULL)
				break;

			/*
			 * Get domain (node) ID and fill in domain/node
			 * to memory mapping table
			 */
			domain = mem->ProximityDomain;
			start = mem->BaseAddress;
			length = mem->Length;
			end = start + length - 1;

			/*
			 * According to ACPI 4.0, both ENABLE and HOTPLUG flags
			 * may be set for memory address range entries in SRAT
			 * table which are reserved for memory hot plug.
			 * We intersect memory address ranges in SRAT table
			 * with memory ranges in physinstalled to filter out
			 * memory address ranges reserved for hot plug.
			 */
			if (mem->Flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) {
				uint64_t	rstart = UINT64_MAX;
				uint64_t	rend = 0;
				struct memlist	*ml;
				extern struct bootops	*bootops;

				memlist_read_lock();
				for (ml = bootops->boot_mem->physinstalled;
				    ml; ml = ml->ml_next) {
					uint64_t tstart = ml->ml_address;
					uint64_t tend;

					tend = ml->ml_address + ml->ml_size;
					if (tstart > end || tend < start)
						continue;
					if (start > tstart)
						tstart = start;
					if (rstart > tstart)
						rstart = tstart;
					if (end < tend)
						tend = end;
					if (rend < tend)
						rend = tend;
				}
				memlist_read_unlock();
				start = rstart;
				end = rend;
				/* Skip this entry if no memory installed. */
				if (start > end)
					break;
			}

			if (lgrp_plat_memnode_info_update(node_domain,
			    node_cnt, memnode_info, node_cnt,
			    start, end, domain, ACPI_MEMNODE_DEVID_BOOT) < 0)
				return (-4);
			break;
		}
		case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {	/* x2apic CPU */
			ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
			    (ACPI_SRAT_X2APIC_CPU_AFFINITY *)item;

			if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED) ||
			    cpu_node == NULL)
				break;

			/*
			 * Calculate domain (node) ID and fill in APIC ID to
			 * domain/node mapping table
			 */
			domain = x2cpu->ProximityDomain;
			apic_id = x2cpu->ApicId;

			rc = lgrp_plat_cpu_node_update(node_domain, node_cnt,
			    cpu_node, cpu_count, apic_id, domain);
			if (rc < 0)
				return (-3);
			else if (rc == 0)
				proc_entry_count++;
			break;
		}
		default:
			break;
		}

		item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
	}

	/*
	 * Should have seen at least as many SRAT processor entries as CPUs
	 */
	if (proc_entry_count < cpu_count)
		return (-5);

	/*
	 * Need to sort nodes by starting physical address since VM system
	 * assumes and expects memnodes to be sorted in ascending order by
	 * physical address
	 */
	lgrp_plat_node_sort(node_domain, node_cnt, cpu_node, cpu_count,
	    memnode_info);

	return (node_cnt);
}
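

/*
 * Example of the hot plug clipping above (ranges hypothetical): an SRAT
 * memory entry covering [4GB, 16GB) with ACPI_SRAT_MEM_HOT_PLUGGABLE set,
 * on a machine where physinstalled only contains [4GB, 8GB), is trimmed to
 * [4GB, 8GB) before being passed to lgrp_plat_memnode_info_update(); if
 * none of the range is actually installed, the entry is skipped entirely.
 */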


/*
 * Allocate permanent memory for any temporary memory that we needed to
 * allocate using BOP_ALLOC() before kmem_alloc() and VM system were
 * initialized and copy everything from temporary to permanent memory since
 * temporary boot memory will eventually be released during boot
 */
static void
lgrp_plat_release_bootstrap(void)
{
	void	*buf;
	size_t	size;

	if (lgrp_plat_cpu_node_nentries > 0) {
		size = lgrp_plat_cpu_node_nentries * sizeof (cpu_node_map_t);
		buf = kmem_alloc(size, KM_SLEEP);
		bcopy(lgrp_plat_cpu_node, buf, size);
		lgrp_plat_cpu_node = buf;
	}
}


/*
 * Return number of proximity domains given in ACPI SRAT
 */
static int
lgrp_plat_srat_domains(ACPI_TABLE_SRAT *tp, uint32_t *prox_domain_min)
{
	int			domain_cnt;
	uint32_t		domain_min;
	ACPI_SUBTABLE_HEADER	*item, *end;
	int			i;
	node_domain_map_t	node_domain[MAX_NODES];


	if (tp == NULL || !lgrp_plat_srat_enable)
		return (1);

	/*
	 * Walk through SRAT to find minimum proximity domain ID
	 */
	domain_min = UINT32_MAX;
	item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
	end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
	while (item < end) {
		uint32_t	domain;

		switch (item->Type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY: {	/* CPU entry */
			ACPI_SRAT_CPU_AFFINITY *cpu =
			    (ACPI_SRAT_CPU_AFFINITY *)item;

			if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
				item = (ACPI_SUBTABLE_HEADER *)
				    ((uintptr_t)item + item->Length);
				continue;
			}
			domain = cpu->ProximityDomainLo;
			for (i = 0; i < 3; i++) {
				domain += cpu->ProximityDomainHi[i] <<
				    ((i + 1) * 8);
			}
			break;
		}
		case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {	/* memory entry */
			ACPI_SRAT_MEM_AFFINITY *mem =
			    (ACPI_SRAT_MEM_AFFINITY *)item;

			if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
				item = (ACPI_SUBTABLE_HEADER *)
				    ((uintptr_t)item + item->Length);
				continue;
			}
			domain = mem->ProximityDomain;
			break;
		}
		case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {	/* x2apic CPU */
			ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
			    (ACPI_SRAT_X2APIC_CPU_AFFINITY *)item;

			if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
				item = (ACPI_SUBTABLE_HEADER *)
				    ((uintptr_t)item + item->Length);
				continue;
			}
			domain = x2cpu->ProximityDomain;
			break;
		}
		default:
			item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
			    item->Length);
			continue;
		}

		/*
		 * Keep track of minimum proximity domain ID
		 */
		if (domain < domain_min)
			domain_min = domain;

		item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
	}
	if (lgrp_plat_domain_min_enable && prox_domain_min != NULL)
		*prox_domain_min = domain_min;

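	/*
	 * The pass below counts distinct proximity domains with a small
	 * open-addressed hash table: each domain ID hashes to slot
	 * (domain % MAX_NODES) in node_domain[] and collisions are resolved
	 * by probing linearly until a free slot or matching entry is found.
	 */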
	/*
	 * Walk through SRAT, examining each CPU and memory entry to determine
	 * the proximity domain ID for each.
	 */
	domain_cnt = 0;
	item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)tp + sizeof (*tp));
	end = (ACPI_SUBTABLE_HEADER *)(tp->Header.Length + (uintptr_t)tp);
	bzero(node_domain, MAX_NODES * sizeof (node_domain_map_t));
	while (item < end) {
		uint32_t	domain;
		boolean_t	overflow;
		uint_t		start;

		switch (item->Type) {
		case ACPI_SRAT_TYPE_CPU_AFFINITY: {	/* CPU entry */
			ACPI_SRAT_CPU_AFFINITY *cpu =
			    (ACPI_SRAT_CPU_AFFINITY *)item;

			if (!(cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
				item = (ACPI_SUBTABLE_HEADER *)
				    ((uintptr_t)item + item->Length);
				continue;
			}
			domain = cpu->ProximityDomainLo;
			for (i = 0; i < 3; i++) {
				domain += cpu->ProximityDomainHi[i] <<
				    ((i + 1) * 8);
			}
			break;
		}
		case ACPI_SRAT_TYPE_MEMORY_AFFINITY: {	/* memory entry */
			ACPI_SRAT_MEM_AFFINITY *mem =
			    (ACPI_SRAT_MEM_AFFINITY *)item;

			if (!(mem->Flags & ACPI_SRAT_MEM_ENABLED)) {
				item = (ACPI_SUBTABLE_HEADER *)
				    ((uintptr_t)item + item->Length);
				continue;
			}
			domain = mem->ProximityDomain;
			break;
		}
		case ACPI_SRAT_TYPE_X2APIC_CPU_AFFINITY: {	/* x2apic CPU */
			ACPI_SRAT_X2APIC_CPU_AFFINITY *x2cpu =
			    (ACPI_SRAT_X2APIC_CPU_AFFINITY *)item;

			if (!(x2cpu->Flags & ACPI_SRAT_CPU_ENABLED)) {
				item = (ACPI_SUBTABLE_HEADER *)
				    ((uintptr_t)item + item->Length);
				continue;
			}
			domain = x2cpu->ProximityDomain;
			break;
		}
		default:
			item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item +
			    item->Length);
			continue;
		}

		/*
		 * Count and keep track of which proximity domain IDs have
		 * been seen
		 */
		start = i = domain % MAX_NODES;
		overflow = B_TRUE;
		do {
			/*
			 * Create entry for proximity domain and increment
			 * count when no entry exists where proximity domain
			 * hashed
			 */
			if (!node_domain[i].exists) {
				node_domain[i].exists = 1;
				node_domain[i].prox_domain = domain;
				domain_cnt++;
				overflow = B_FALSE;
				break;
			}

			/*
			 * Nothing to do when proximity domain seen already
			 * and its entry exists
			 */
			if (node_domain[i].prox_domain == domain) {
				overflow = B_FALSE;
				break;
			}

			/*
			 * Entry exists where proximity domain hashed, but for
			 * different proximity domain so keep searching for an
			 * empty slot to put it or a matching entry, whichever
			 * comes first.
			 */
			i = (i + 1) % MAX_NODES;
		} while (i != start);

		/*
		 * Didn't find empty or matching entry which means have more
		 * proximity domains than supported nodes (:-(
		 */
		ASSERT(overflow != B_TRUE);
		if (overflow == B_TRUE)
			return (-1);

		item = (ACPI_SUBTABLE_HEADER *)((uintptr_t)item + item->Length);
	}
	return (domain_cnt);
}
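

/*
 * Note that lgrp_plat_msct_domains() below returns
 * tp->MaxProximityDomains + 1 because that MSCT field records the highest
 * possible proximity domain ID rather than a count, so it also covers
 * domains that only appear after CPU or memory hot-add.
 */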


/*
 * Parse domain information in ACPI Maximum System Characteristics Table
 * (MSCT).  The MSCT table has already been verified by process_msct() in
 * fakebop.c.
 */
static int
lgrp_plat_msct_domains(ACPI_TABLE_MSCT *tp, uint32_t *prox_domain_min)
{
	int			last_seen = 0;
	uint32_t		proxmin = UINT32_MAX;
	ACPI_MSCT_PROXIMITY	*item, *end;

	if (tp == NULL || lgrp_plat_msct_enable == 0)
		return (-1);

	if (tp->MaxProximityDomains >= MAX_NODES) {
		cmn_err(CE_CONT,
		    "?lgrp: too many proximity domains (%d), max %d supported, "
		    "disable support of CPU/memory DR operations.",
		    tp->MaxProximityDomains + 1, MAX_NODES);
		plat_dr_disable_cpu();
		plat_dr_disable_memory();
		return (-1);
	}

	if (prox_domain_min != NULL) {
		end = (void *)(tp->Header.Length + (uintptr_t)tp);
		for (item = (void *)((uintptr_t)tp +
		    tp->ProximityOffset); item < end;
		    item = (void *)(item->Length + (uintptr_t)item)) {
			if (item->RangeStart < proxmin) {
				proxmin = item->RangeStart;
			}

			last_seen = item->RangeEnd - item->RangeStart + 1;
			/*
			 * Break out if all proximity domains have been
			 * processed.  Some BIOSes may have unused items
			 * at the end of the MSCT table.
			 */
			if (last_seen > tp->MaxProximityDomains) {
				break;
			}
		}
		*prox_domain_min = proxmin;
	}

	return (tp->MaxProximityDomains + 1);
}


/*
 * Set lgroup latencies for 2 level lgroup topology
 */
static void
lgrp_plat_2level_setup(lgrp_plat_latency_stats_t *lat_stats)
{
	int	i, j;

	ASSERT(lat_stats != NULL);

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lat_stats->latencies[i][j] = 2;
			else
				lat_stats->latencies[i][j] = 3;
		}
	}
	lat_stats->latency_min = 2;
	lat_stats->latency_max = 3;
	/* TODO: check it. */
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
	lgrp_plat_topo_flatten = 1;
}


/*
 * The following Opteron specific constants, macros, types, and routines define
 * PCI configuration space registers and how to read them to determine the NUMA
 * configuration of *supported* Opteron processors.  They provide the same
 * information that may be gotten from the ACPI System Resource Affinity Table
 * (SRAT) if it exists on the machine of interest.
 *
 * The AMD BIOS and Kernel Developer's Guide (BKDG) for the processor family
 * of interest describes all of these registers and their contents.  The main
 * registers used by this code to determine the NUMA configuration of the
 * machine are the node ID register for the number of NUMA nodes and the DRAM
 * address map registers for the physical address range of each node.
 *
 * NOTE: The format and how to determine the NUMA configuration using PCI
 *	 config space registers may change or may not be supported in future
 *	 Opteron processor families.
 */

/*
 * How many bits to shift Opteron DRAM Address Map base and limit registers
 * to get actual value
 */
#define	OPT_DRAMADDR_HI_LSHIFT_ADDR	40	/* shift left for address */
#define	OPT_DRAMADDR_LO_LSHIFT_ADDR	8	/* shift left for address */

#define	OPT_DRAMADDR_HI_MASK_ADDR	0x000000FF	/* address bits 47-40 */
#define	OPT_DRAMADDR_LO_MASK_ADDR	0xFFFF0000	/* address bits 39-24 */

#define	OPT_DRAMADDR_LO_MASK_OFF	0xFFFFFF	/* offset for address */

/*
 * Macros to derive addresses from Opteron DRAM Address Map registers
 */
#define	OPT_DRAMADDR_HI(reg) \
	(((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \
	    OPT_DRAMADDR_HI_LSHIFT_ADDR)

#define	OPT_DRAMADDR_LO(reg) \
	(((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \
	    OPT_DRAMADDR_LO_LSHIFT_ADDR)

#define	OPT_DRAMADDR(high, low) \
	(OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low))
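
/*
 * Worked example (register values hypothetical): with base_hi == 0x01 and
 * base_lo == 0x00100003 (read/write enable bits set in the low bits),
 *
 *	OPT_DRAMADDR(0x01, 0x00100003)
 *	    == (0x01ULL << 40) | (0x00100000ULL << 8)
 *	    == 0x10010000000
 *
 * i.e. the node's DRAM range starts at physical address 1TB + 256MB.
 */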

/*
 * Bit masks defining what's in Opteron DRAM Address Map base register
 */
#define	OPT_DRAMBASE_LO_MASK_RE		0x1	/* read enable */
#define	OPT_DRAMBASE_LO_MASK_WE		0x2	/* write enable */
#define	OPT_DRAMBASE_LO_MASK_INTRLVEN	0x700	/* interleave */

/*
 * Bit masks defining what's in Opteron DRAM Address Map limit register
 */
#define	OPT_DRAMLIMIT_LO_MASK_DSTNODE	0x7	/* destination node */
#define	OPT_DRAMLIMIT_LO_MASK_INTRLVSEL	0x700	/* interleave select */


/*
 * Opteron Node ID register in PCI configuration space contains
 * number of nodes in system, etc. for Opteron K8.  The following
 * constants and macros define its contents, structure, and access.
 */

/*
 * Bit masks defining what's in Opteron Node ID register
 */
#define	OPT_NODE_MASK_ID	0x7	/* node ID */
#define	OPT_NODE_MASK_CNT	0x70	/* node count */
#define	OPT_NODE_MASK_IONODE	0x700	/* Hypertransport I/O hub node ID */
#define	OPT_NODE_MASK_LCKNODE	0x7000	/* lock controller node ID */
#define	OPT_NODE_MASK_CPUCNT	0xF0000	/* CPUs in system (0 means 1 CPU) */

/*
 * How many bits in Opteron Node ID register to shift right to get actual value
 */
#define	OPT_NODE_RSHIFT_CNT	0x4	/* shift right for node count value */

/*
 * Macros to get values from Opteron Node ID register
 */
#define	OPT_NODE_CNT(reg) \
	((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT)

/*
 * Macro to setup PCI Extended Configuration Space (ECS) address to give to
 * "in/out" instructions
 *
 * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any
 *	 other uses should just do MMIO to access PCI ECS.
 *	 Must enable special bit in Northbridge Configuration Register on
 *	 Greyhound for extended CF8 space access to be able to access PCI ECS
 *	 using "in/out" instructions and restore special bit after done
 *	 accessing PCI ECS.
 */
#define	OPT_PCI_ECS_ADDR(bus, device, function, reg) \
	(PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \
	    (((function) & 0x7) << 8) | ((reg) & 0xfc) | \
	    ((((reg) >> 8) & 0xf) << 24))
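
/*
 * Worked example (values chosen for illustration): bus 0, device 24,
 * function 1, register offset 0x140 gives
 *
 *	OPT_PCI_ECS_ADDR(0, 24, 1, 0x140)
 *	    == PCI_CONE | (24 << 11) | (1 << 8) | 0x40 | (0x1 << 24)
 *	    == PCI_CONE | 0x0100c140
 *
 * where the high nibble of the register offset (bits 11:8 of reg) is
 * carried in address bits 27:24 for extended configuration space.
 */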

/*
 * PCI configuration space registers accessed by specifying
 * a bus, device, function, and offset.  The following constants
 * define the values needed to access Opteron K8 configuration
 * info to determine its node topology
 */

#define	OPT_PCS_BUS_CONFIG	0	/* Hypertransport config space bus */

/*
 * Opteron PCI configuration space register function values
 */
#define	OPT_PCS_FUNC_HT		0	/* Hypertransport configuration */
#define	OPT_PCS_FUNC_ADDRMAP	1	/* Address map configuration */
#define	OPT_PCS_FUNC_DRAM	2	/* DRAM configuration */
#define	OPT_PCS_FUNC_MISC	3	/* Miscellaneous configuration */

/*
 * PCI Configuration Space register offsets
 */
#define	OPT_PCS_OFF_VENDOR	0x0	/* device/vendor ID register */
#define	OPT_PCS_OFF_DRAMBASE_HI	0x140	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_DRAMBASE_LO	0x40	/* DRAM Base register (node 0) */
#define	OPT_PCS_OFF_NODEID	0x60	/* Node ID register */

/*
 * Opteron PCI Configuration Space device IDs for nodes
 */
#define	OPT_PCS_DEV_NODE0	24	/* device number for node 0 */


/*
 * Opteron DRAM address map gives base and limit for physical memory in a node
 */
typedef	struct opt_dram_addr_map {
	uint32_t	base_hi;
	uint32_t	base_lo;
	uint32_t	limit_hi;
	uint32_t	limit_lo;
} opt_dram_addr_map_t;


/*
 * Supported AMD processor families
 */
#define	AMD_FAMILY_HAMMER	15
#define	AMD_FAMILY_GREYHOUND	16

/*
 * Whether to have is_opteron() return 1 even when processor isn't supported
 */
uint_t	is_opteron_override = 0;

/*
 * AMD processor family for current CPU
 */
uint_t	opt_family = 0;


/*
 * Determine whether we're running on a supported AMD Opteron, since reading
 * the node count and DRAM address map registers may have a different format
 * or may not be supported across processor families
 */
static int
is_opteron(void)
{

	if (x86_vendor != X86_VENDOR_AMD)
		return (0);

	opt_family = cpuid_getfamily(CPU);
	if (opt_family == AMD_FAMILY_HAMMER ||
	    opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override)
		return (1);
	else
		return (0);
}


/*
 * Determine NUMA configuration for Opteron from registers that live in PCI
 * configuration space
 */
static void
opt_get_numa_config(uint_t *node_cnt, int *mem_intrlv,
    memnode_phys_addr_map_t *memnode_info)
{
	uint_t	bus;
	uint_t	dev;
	struct opt_dram_addr_map	dram_map[MAX_NODES];
	uint_t	node;
	uint_t	node_info[MAX_NODES];
	uint_t	off_hi;
	uint_t	off_lo;
	uint64_t nb_cfg_reg;

	/*
	 * Read configuration registers from PCI configuration space to
	 * determine node information, which memory is in each node, etc.
	 *
	 * Write to PCI configuration space address register to specify
	 * which configuration register to read and read/write PCI
	 * configuration space data register to get/set contents
	 */
	bus = OPT_PCS_BUS_CONFIG;
	dev = OPT_PCS_DEV_NODE0;
	off_hi = OPT_PCS_OFF_DRAMBASE_HI;
	off_lo = OPT_PCS_OFF_DRAMBASE_LO;

	/*
	 * Read node ID register for node 0 to get node count
	 */
	node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT,
	    OPT_PCS_OFF_NODEID);
	*node_cnt = OPT_NODE_CNT(node_info[0]) + 1;
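
	/*
	 * For example (register value hypothetical), node_info[0] == 0x31
	 * would decode as node ID 1 (OPT_NODE_MASK_ID) with a node count
	 * field of 3, so *node_cnt becomes 4 since the hardware field holds
	 * the node count minus 1.
	 */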

	/*
	 * If number of nodes is more than maximum supported, then set node
	 * count to 1 and treat system as UMA instead of NUMA.
	 */
	if (*node_cnt > MAX_NODES) {
		*node_cnt = 1;
		return;
	}

	/*
	 * For Greyhound, PCI Extended Configuration Space must be enabled to
	 * read high DRAM address map base and limit registers
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG);
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG,
			    nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS);
	}

	for (node = 0; node < *node_cnt; node++) {
		uint32_t	base_hi;
		uint32_t	base_lo;
		uint32_t	limit_hi;
		uint32_t	limit_lo;

		/*
		 * Read node ID register (except for node 0 which we just read)
		 */
		if (node > 0) {
			node_info[node] = pci_getl_func(bus, dev,
			    OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID);
		}

		/*
		 * Read DRAM base and limit registers which specify
		 * physical memory range of each node
		 */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			base_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			base_hi = dram_map[node].base_hi =
			    inl(PCI_CONFDATA);
		}
		base_lo = dram_map[node].base_lo = pci_getl_func(bus, dev,
		    OPT_PCS_FUNC_ADDRMAP, off_lo);

		if ((dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) &&
		    mem_intrlv)
			*mem_intrlv = *mem_intrlv + 1;

		off_hi += 4;	/* high limit register offset */
		if (opt_family != AMD_FAMILY_GREYHOUND)
			limit_hi = 0;
		else {
			outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev,
			    OPT_PCS_FUNC_ADDRMAP, off_hi));
			limit_hi = dram_map[node].limit_hi =
			    inl(PCI_CONFDATA);
		}

		off_lo += 4;	/* low limit register offset */
		limit_lo = dram_map[node].limit_lo = pci_getl_func(bus,
		    dev, OPT_PCS_FUNC_ADDRMAP, off_lo);

		/*
		 * Increment device number to next node and register offsets
		 * for DRAM base register of next node
		 */
		off_hi += 4;
		off_lo += 4;
		dev++;

		/*
		 * Both read and write enable bits must be enabled in DRAM
		 * address map base register for physical memory to exist in
		 * node
		 */
		if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 ||
		    (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) {
			/*
			 * Mark node memory as non-existent and set start and
			 * end addresses to be same in memnode_info[]
			 */
			memnode_info[node].exists = 0;
			memnode_info[node].start = memnode_info[node].end =
			    (pfn_t)-1;
			continue;
		}

		/*
		 * Mark node memory as existing and remember physical address
		 * range of each node for use later
		 */
		memnode_info[node].exists = 1;

		memnode_info[node].start = btop(OPT_DRAMADDR(base_hi, base_lo));

		memnode_info[node].end = btop(OPT_DRAMADDR(limit_hi, limit_lo) |
		    OPT_DRAMADDR_LO_MASK_OFF);
	}

	/*
	 * Restore PCI Extended Configuration Space enable bit
	 */
	if (opt_family == AMD_FAMILY_GREYHOUND) {
		if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0)
			wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg);
	}
}


/*
 * Return the average time it takes to read the vendor ID register on the
 * Northbridge of the specified destination node N times from the current CPU
 */
static hrtime_t
opt_probe_vendor(int dest_node, int nreads)
{
	int		cnt;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		ipl;
	hrtime_t	start;

	dev = OPT_PCS_DEV_NODE0 + dest_node;
	kpreempt_disable();
	ipl = spl8();
	outl(PCI_CONFADD, PCI_CADDR1(0, dev, OPT_PCS_FUNC_DRAM,
	    OPT_PCS_OFF_VENDOR));
	start = gethrtime();
	for (cnt = 0; cnt < nreads; cnt++)
		dev_vendor = inl(PCI_CONFDATA);
	end = gethrtime();
	elapsed = (end - start) / nreads;
	splx(ipl);
	kpreempt_enable();
	return (elapsed);
}