17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5c39996a7Sstevel * Common Development and Distribution License (the "License"). 6c39996a7Sstevel * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 21c39996a7Sstevel 227c478bd9Sstevel@tonic-gate /* 23fb2f18f8Sesaxe * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 247c478bd9Sstevel@tonic-gate * Use is subject to license terms. 
257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 277c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate 307c478bd9Sstevel@tonic-gate #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ 317c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 32f78a91cdSjjc #include <sys/controlregs.h> 337c478bd9Sstevel@tonic-gate #include <sys/cpupart.h> 347c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 357c478bd9Sstevel@tonic-gate #include <sys/lgrp.h> 367c478bd9Sstevel@tonic-gate #include <sys/machsystm.h> 377c478bd9Sstevel@tonic-gate #include <sys/memlist.h> 387c478bd9Sstevel@tonic-gate #include <sys/memnode.h> 397c478bd9Sstevel@tonic-gate #include <sys/mman.h> 40ef50d8c0Sesaxe #include <sys/pci_cfgspace.h> 41ef50d8c0Sesaxe #include <sys/pci_impl.h> 427c478bd9Sstevel@tonic-gate #include <sys/param.h> 43fb2f18f8Sesaxe #include <sys/pghw.h> 447c478bd9Sstevel@tonic-gate #include <sys/promif.h> /* for prom_printf() */ 457c478bd9Sstevel@tonic-gate #include <sys/systm.h> 467c478bd9Sstevel@tonic-gate #include <sys/thread.h> 477c478bd9Sstevel@tonic-gate #include <sys/types.h> 487c478bd9Sstevel@tonic-gate #include <sys/var.h> 497c478bd9Sstevel@tonic-gate #include <sys/x86_archext.h> /* for x86_feature and X86_AMD */ 507c478bd9Sstevel@tonic-gate #include <vm/hat_i86.h> 517c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 52affbd3ccSkchow #include <vm/vm_dep.h> 537c478bd9Sstevel@tonic-gate 547c478bd9Sstevel@tonic-gate 557c478bd9Sstevel@tonic-gate /* 567c478bd9Sstevel@tonic-gate * lgroup platform support for x86 platforms. 
577c478bd9Sstevel@tonic-gate */ 587c478bd9Sstevel@tonic-gate 597c478bd9Sstevel@tonic-gate #define MAX_NODES 8 607c478bd9Sstevel@tonic-gate #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1) 617c478bd9Sstevel@tonic-gate 62fb2f18f8Sesaxe #define LGRP_PLAT_CPU_TO_NODE(cpu) (pg_plat_hw_instance_id(cpu, PGHW_CHIP)) 637c478bd9Sstevel@tonic-gate 647c478bd9Sstevel@tonic-gate #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */ 657c478bd9Sstevel@tonic-gate #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */ 668949bcd6Sandrei #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */ 677c478bd9Sstevel@tonic-gate 687c478bd9Sstevel@tonic-gate /* 697c478bd9Sstevel@tonic-gate * Multiprocessor Opteron machines have Non Uniform Memory Access (NUMA). 707c478bd9Sstevel@tonic-gate * 71f78a91cdSjjc * Until this code supports reading System Resource Affinity Table (SRAT), 727c478bd9Sstevel@tonic-gate * we need to examine registers in PCI configuration space to determine how 737c478bd9Sstevel@tonic-gate * many nodes are in the system and which CPUs and memory are in each node. 747c478bd9Sstevel@tonic-gate * This could be determined by probing all memory from each CPU, but that is 757c478bd9Sstevel@tonic-gate * too expensive to do while booting the kernel. 767c478bd9Sstevel@tonic-gate * 777c478bd9Sstevel@tonic-gate * NOTE: Using these PCI configuration space registers to determine this 78f78a91cdSjjc * locality info is not guaranteed to work on future generations of 79f78a91cdSjjc * Opteron processor. 807c478bd9Sstevel@tonic-gate */ 817c478bd9Sstevel@tonic-gate 827c478bd9Sstevel@tonic-gate /* 837c478bd9Sstevel@tonic-gate * Opteron DRAM Address Map in PCI configuration space gives base and limit 84f78a91cdSjjc * of physical memory in each node. The following constants and macros define 85f78a91cdSjjc * their contents, structure, and access. 
867c478bd9Sstevel@tonic-gate */ 877c478bd9Sstevel@tonic-gate 887c478bd9Sstevel@tonic-gate /* 897c478bd9Sstevel@tonic-gate * How many bits to shift Opteron DRAM Address Map base and limit registers 907c478bd9Sstevel@tonic-gate * to get actual value 917c478bd9Sstevel@tonic-gate */ 92f78a91cdSjjc #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 93f78a91cdSjjc #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 947c478bd9Sstevel@tonic-gate 95f78a91cdSjjc #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 96f78a91cdSjjc #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 97f78a91cdSjjc 98f78a91cdSjjc #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 99f78a91cdSjjc 100f78a91cdSjjc /* 101f78a91cdSjjc * Macros to derive addresses from Opteron DRAM Address Map registers 102f78a91cdSjjc */ 103f78a91cdSjjc #define OPT_DRAMADDR_HI(reg) \ 104f78a91cdSjjc (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 105f78a91cdSjjc OPT_DRAMADDR_HI_LSHIFT_ADDR) 106f78a91cdSjjc 107f78a91cdSjjc #define OPT_DRAMADDR_LO(reg) \ 108f78a91cdSjjc (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 109f78a91cdSjjc OPT_DRAMADDR_LO_LSHIFT_ADDR) 110f78a91cdSjjc 111f78a91cdSjjc #define OPT_DRAMADDR(high, low) \ 112f78a91cdSjjc (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 1137c478bd9Sstevel@tonic-gate 1147c478bd9Sstevel@tonic-gate /* 1157c478bd9Sstevel@tonic-gate * Bit masks defining what's in Opteron DRAM Address Map base register 1167c478bd9Sstevel@tonic-gate */ 117f78a91cdSjjc #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 118f78a91cdSjjc #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 119f78a91cdSjjc #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 1207c478bd9Sstevel@tonic-gate 1217c478bd9Sstevel@tonic-gate /* 1227c478bd9Sstevel@tonic-gate * Bit masks defining what's in Opteron DRAM Address Map limit register 1237c478bd9Sstevel@tonic-gate */ 124f78a91cdSjjc #define 
OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 125f78a91cdSjjc #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 1267c478bd9Sstevel@tonic-gate 1277c478bd9Sstevel@tonic-gate 1287c478bd9Sstevel@tonic-gate /* 1297c478bd9Sstevel@tonic-gate * Opteron Node ID register in PCI configuration space contains 1307c478bd9Sstevel@tonic-gate * number of nodes in system, etc. for Opteron K8. The following 1317c478bd9Sstevel@tonic-gate * constants and macros define its contents, structure, and access. 1327c478bd9Sstevel@tonic-gate */ 1337c478bd9Sstevel@tonic-gate 1347c478bd9Sstevel@tonic-gate /* 1357c478bd9Sstevel@tonic-gate * Bit masks defining what's in Opteron Node ID register 1367c478bd9Sstevel@tonic-gate */ 1377c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_ID 0x7 /* node ID */ 1387c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_CNT 0x70 /* node count */ 1397c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 1407c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 1417c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 1427c478bd9Sstevel@tonic-gate 1437c478bd9Sstevel@tonic-gate /* 1447c478bd9Sstevel@tonic-gate * How many bits in Opteron Node ID register to shift right to get actual value 1457c478bd9Sstevel@tonic-gate */ 1467c478bd9Sstevel@tonic-gate #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 1477c478bd9Sstevel@tonic-gate 1487c478bd9Sstevel@tonic-gate /* 1497c478bd9Sstevel@tonic-gate * Macros to get values from Opteron Node ID register 1507c478bd9Sstevel@tonic-gate */ 1517c478bd9Sstevel@tonic-gate #define OPT_NODE_CNT(reg) \ 1527c478bd9Sstevel@tonic-gate ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 1537c478bd9Sstevel@tonic-gate 154f78a91cdSjjc /* 155f78a91cdSjjc * Macro to setup PCI Extended Configuration Space (ECS) address to give to 156f78a91cdSjjc * "in/out" instructions 
157f78a91cdSjjc * 158f78a91cdSjjc * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 159f78a91cdSjjc * other uses should just do MMIO to access PCI ECS. 160f78a91cdSjjc * Must enable special bit in Northbridge Configuration Register on 161f78a91cdSjjc * Greyhound for extended CF8 space access to be able to access PCI ECS 162f78a91cdSjjc * using "in/out" instructions and restore special bit after done 163f78a91cdSjjc * accessing PCI ECS. 164f78a91cdSjjc */ 165f78a91cdSjjc #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 166f78a91cdSjjc (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 167f78a91cdSjjc (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 168f78a91cdSjjc ((((reg) >> 8) & 0xf) << 24)) 1697c478bd9Sstevel@tonic-gate 1707c478bd9Sstevel@tonic-gate /* 1717c478bd9Sstevel@tonic-gate * PCI configuration space registers accessed by specifying 1727c478bd9Sstevel@tonic-gate * a bus, device, function, and offset. The following constants 1737c478bd9Sstevel@tonic-gate * define the values needed to access Opteron K8 configuration 1747c478bd9Sstevel@tonic-gate * info to determine its node topology 1757c478bd9Sstevel@tonic-gate */ 1767c478bd9Sstevel@tonic-gate 1777c478bd9Sstevel@tonic-gate #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 1787c478bd9Sstevel@tonic-gate 1797c478bd9Sstevel@tonic-gate /* 1807c478bd9Sstevel@tonic-gate * Opteron PCI configuration space register function values 1817c478bd9Sstevel@tonic-gate */ 1827c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 1837c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 1847c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 1857c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 1867c478bd9Sstevel@tonic-gate 1877c478bd9Sstevel@tonic-gate /* 1887c478bd9Sstevel@tonic-gate * PCI Configuration Space register offsets 
1897c478bd9Sstevel@tonic-gate */ 1907c478bd9Sstevel@tonic-gate #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 191f78a91cdSjjc #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 192f78a91cdSjjc #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 1937c478bd9Sstevel@tonic-gate #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 1947c478bd9Sstevel@tonic-gate 1957c478bd9Sstevel@tonic-gate /* 1967c478bd9Sstevel@tonic-gate * Opteron PCI Configuration Space device IDs for nodes 1977c478bd9Sstevel@tonic-gate */ 1987c478bd9Sstevel@tonic-gate #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 1997c478bd9Sstevel@tonic-gate 2007c478bd9Sstevel@tonic-gate 2017c478bd9Sstevel@tonic-gate /* 2027c478bd9Sstevel@tonic-gate * Bookkeeping for latencies seen during probing (used for verification) 2037c478bd9Sstevel@tonic-gate */ 2047c478bd9Sstevel@tonic-gate typedef struct lgrp_plat_latency_acct { 2057c478bd9Sstevel@tonic-gate hrtime_t la_value; /* latency value */ 2067c478bd9Sstevel@tonic-gate int la_count; /* occurrences */ 2077c478bd9Sstevel@tonic-gate } lgrp_plat_latency_acct_t; 2087c478bd9Sstevel@tonic-gate 2097c478bd9Sstevel@tonic-gate 2107c478bd9Sstevel@tonic-gate /* 2117c478bd9Sstevel@tonic-gate * Choices for probing to determine lgroup topology 2127c478bd9Sstevel@tonic-gate */ 2137c478bd9Sstevel@tonic-gate typedef enum lgrp_plat_probe_op { 2147c478bd9Sstevel@tonic-gate LGRP_PLAT_PROBE_PGCPY, /* Use page copy */ 2157c478bd9Sstevel@tonic-gate LGRP_PLAT_PROBE_VENDOR /* Read vendor ID on Northbridge */ 2167c478bd9Sstevel@tonic-gate } lgrp_plat_probe_op_t; 2177c478bd9Sstevel@tonic-gate 2187c478bd9Sstevel@tonic-gate 2197c478bd9Sstevel@tonic-gate /* 2207c478bd9Sstevel@tonic-gate * Opteron DRAM address map gives base and limit for physical memory in a node 2217c478bd9Sstevel@tonic-gate */ 2227c478bd9Sstevel@tonic-gate typedef struct opt_dram_addr_map { 223f78a91cdSjjc uint32_t base_hi; 224f78a91cdSjjc uint32_t base_lo; 
225f78a91cdSjjc uint32_t limit_hi; 226f78a91cdSjjc uint32_t limit_lo; 2277c478bd9Sstevel@tonic-gate } opt_dram_addr_map_t; 2287c478bd9Sstevel@tonic-gate 2297c478bd9Sstevel@tonic-gate 2307c478bd9Sstevel@tonic-gate /* 2317c478bd9Sstevel@tonic-gate * Starting and ending page for physical memory in node 2327c478bd9Sstevel@tonic-gate */ 2337c478bd9Sstevel@tonic-gate typedef struct phys_addr_map { 2347c478bd9Sstevel@tonic-gate pfn_t start; 2357c478bd9Sstevel@tonic-gate pfn_t end; 236a940d195Sjjc int exists; 2377c478bd9Sstevel@tonic-gate } phys_addr_map_t; 2387c478bd9Sstevel@tonic-gate 2397c478bd9Sstevel@tonic-gate 2407c478bd9Sstevel@tonic-gate /* 2417c478bd9Sstevel@tonic-gate * Opteron DRAM address map for each node 2427c478bd9Sstevel@tonic-gate */ 2437c478bd9Sstevel@tonic-gate struct opt_dram_addr_map opt_dram_map[MAX_NODES]; 2447c478bd9Sstevel@tonic-gate 2457c478bd9Sstevel@tonic-gate /* 2467c478bd9Sstevel@tonic-gate * Node ID register contents for each node 2477c478bd9Sstevel@tonic-gate */ 2487c478bd9Sstevel@tonic-gate uint_t opt_node_info[MAX_NODES]; 2497c478bd9Sstevel@tonic-gate 2507c478bd9Sstevel@tonic-gate /* 2517c478bd9Sstevel@tonic-gate * Whether memory is interleaved across nodes causing MPO to be disabled 2527c478bd9Sstevel@tonic-gate */ 2537c478bd9Sstevel@tonic-gate int lgrp_plat_mem_intrlv = 0; 2547c478bd9Sstevel@tonic-gate 2557c478bd9Sstevel@tonic-gate /* 2567c478bd9Sstevel@tonic-gate * Number of nodes in system 2577c478bd9Sstevel@tonic-gate */ 2587c478bd9Sstevel@tonic-gate uint_t lgrp_plat_node_cnt = 1; 2597c478bd9Sstevel@tonic-gate 2607c478bd9Sstevel@tonic-gate /* 2617c478bd9Sstevel@tonic-gate * Physical address range for memory in each node 2627c478bd9Sstevel@tonic-gate */ 2637c478bd9Sstevel@tonic-gate phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 2647c478bd9Sstevel@tonic-gate 2657c478bd9Sstevel@tonic-gate /* 2667c478bd9Sstevel@tonic-gate * Probe costs (individual and total) and flush cost 2677c478bd9Sstevel@tonic-gate */ 2687c478bd9Sstevel@tonic-gate 
hrtime_t lgrp_plat_flush_cost = 0; 2697c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_cost = 0; 2707c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_cost_total = 0; 2717c478bd9Sstevel@tonic-gate 2727c478bd9Sstevel@tonic-gate /* 2737c478bd9Sstevel@tonic-gate * Error code for latency adjustment and verification 2747c478bd9Sstevel@tonic-gate */ 2757c478bd9Sstevel@tonic-gate int lgrp_plat_probe_error_code = 0; 2767c478bd9Sstevel@tonic-gate 2777c478bd9Sstevel@tonic-gate /* 2787c478bd9Sstevel@tonic-gate * How much latencies were off from minimum values gotten 2797c478bd9Sstevel@tonic-gate */ 2807c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_errors[MAX_NODES][MAX_NODES]; 2817c478bd9Sstevel@tonic-gate 2827c478bd9Sstevel@tonic-gate /* 2837c478bd9Sstevel@tonic-gate * Unique probe latencies and number of occurrences of each 2847c478bd9Sstevel@tonic-gate */ 2857c478bd9Sstevel@tonic-gate lgrp_plat_latency_acct_t lgrp_plat_probe_lat_acct[MAX_NODES]; 2867c478bd9Sstevel@tonic-gate 2877c478bd9Sstevel@tonic-gate /* 2887c478bd9Sstevel@tonic-gate * Size of memory buffer in each node for probing 2897c478bd9Sstevel@tonic-gate */ 2907c478bd9Sstevel@tonic-gate size_t lgrp_plat_probe_memsize = 0; 2917c478bd9Sstevel@tonic-gate 2927c478bd9Sstevel@tonic-gate /* 2937c478bd9Sstevel@tonic-gate * Virtual address of page in each node for probing 2947c478bd9Sstevel@tonic-gate */ 2957c478bd9Sstevel@tonic-gate caddr_t lgrp_plat_probe_memory[MAX_NODES]; 2967c478bd9Sstevel@tonic-gate 2977c478bd9Sstevel@tonic-gate /* 2987c478bd9Sstevel@tonic-gate * Number of unique latencies in probe times 2997c478bd9Sstevel@tonic-gate */ 3007c478bd9Sstevel@tonic-gate int lgrp_plat_probe_nlatencies = 0; 3017c478bd9Sstevel@tonic-gate 3027c478bd9Sstevel@tonic-gate /* 3037c478bd9Sstevel@tonic-gate * How many rounds of probing to do 3047c478bd9Sstevel@tonic-gate */ 3057c478bd9Sstevel@tonic-gate int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 3067c478bd9Sstevel@tonic-gate 3077c478bd9Sstevel@tonic-gate /* 
3087c478bd9Sstevel@tonic-gate * Number of samples to take when probing each node 3097c478bd9Sstevel@tonic-gate */ 3107c478bd9Sstevel@tonic-gate int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 3117c478bd9Sstevel@tonic-gate 3127c478bd9Sstevel@tonic-gate /* 3138949bcd6Sandrei * Number of times to read vendor ID from Northbridge for each probe. 3148949bcd6Sandrei */ 3158949bcd6Sandrei int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 3168949bcd6Sandrei 3178949bcd6Sandrei /* 3187c478bd9Sstevel@tonic-gate * How to probe to determine lgroup topology 3197c478bd9Sstevel@tonic-gate */ 3207c478bd9Sstevel@tonic-gate lgrp_plat_probe_op_t lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR; 3217c478bd9Sstevel@tonic-gate 3227c478bd9Sstevel@tonic-gate /* 3237c478bd9Sstevel@tonic-gate * PFN of page in each node for probing 3247c478bd9Sstevel@tonic-gate */ 3257c478bd9Sstevel@tonic-gate pfn_t lgrp_plat_probe_pfn[MAX_NODES]; 3267c478bd9Sstevel@tonic-gate 3277c478bd9Sstevel@tonic-gate /* 3287c478bd9Sstevel@tonic-gate * Whether probe time was suspect (ie. 
not within tolerance of value that it 3297c478bd9Sstevel@tonic-gate * should match) 3307c478bd9Sstevel@tonic-gate */ 3317c478bd9Sstevel@tonic-gate int lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES]; 3327c478bd9Sstevel@tonic-gate 3337c478bd9Sstevel@tonic-gate /* 3347c478bd9Sstevel@tonic-gate * How long it takes to access memory from each node 3357c478bd9Sstevel@tonic-gate */ 3367c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_times[MAX_NODES][MAX_NODES]; 3377c478bd9Sstevel@tonic-gate 3387c478bd9Sstevel@tonic-gate /* 3397c478bd9Sstevel@tonic-gate * Min and max node memory probe times seen 3407c478bd9Sstevel@tonic-gate */ 3417c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_time_max = 0; 3427c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_time_min = -1; 3437c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_max[MAX_NODES][MAX_NODES]; 3447c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_min[MAX_NODES][MAX_NODES]; 3457c478bd9Sstevel@tonic-gate 3467c478bd9Sstevel@tonic-gate 3477c478bd9Sstevel@tonic-gate /* 3487c478bd9Sstevel@tonic-gate * Allocate lgrp and lgrp stat arrays statically. 
3497c478bd9Sstevel@tonic-gate */ 3507c478bd9Sstevel@tonic-gate static lgrp_t lgrp_space[NLGRP]; 3517c478bd9Sstevel@tonic-gate static int nlgrps_alloc; 3527c478bd9Sstevel@tonic-gate 3537c478bd9Sstevel@tonic-gate struct lgrp_stats lgrp_stats[NLGRP]; 3547c478bd9Sstevel@tonic-gate 355f78a91cdSjjc /* 356f78a91cdSjjc * Supported AMD processor families 357f78a91cdSjjc */ 358f78a91cdSjjc #define AMD_FAMILY_HAMMER 15 359f78a91cdSjjc #define AMD_FAMILY_GREYHOUND 16 3607c478bd9Sstevel@tonic-gate 361f78a91cdSjjc /* 362f78a91cdSjjc * Whether to have is_opteron() return 1 even when processor isn't 363f78a91cdSjjc * supported 364f78a91cdSjjc */ 365f78a91cdSjjc uint_t is_opteron_override = 0; 366f78a91cdSjjc 367f78a91cdSjjc /* 368f78a91cdSjjc * AMD processor family for current CPU 369f78a91cdSjjc */ 3707c478bd9Sstevel@tonic-gate uint_t opt_family = 0; 371f78a91cdSjjc 3727c478bd9Sstevel@tonic-gate uint_t opt_probe_func = OPT_PCS_FUNC_DRAM; 3737c478bd9Sstevel@tonic-gate 3747c478bd9Sstevel@tonic-gate 3757c478bd9Sstevel@tonic-gate /* 376f78a91cdSjjc * Determine whether we're running on a supported AMD Opteron since reading 377f78a91cdSjjc * node count and DRAM address map registers may have different format or 378f78a91cdSjjc * may not be supported in future processor families 3797c478bd9Sstevel@tonic-gate */ 3807c478bd9Sstevel@tonic-gate int 3817c478bd9Sstevel@tonic-gate is_opteron(void) 3827c478bd9Sstevel@tonic-gate { 383f78a91cdSjjc 3847c478bd9Sstevel@tonic-gate if (x86_vendor != X86_VENDOR_AMD) 3857c478bd9Sstevel@tonic-gate return (0); 3867c478bd9Sstevel@tonic-gate 387f78a91cdSjjc opt_family = cpuid_getfamily(CPU); 388f78a91cdSjjc if (opt_family == AMD_FAMILY_HAMMER || 389f78a91cdSjjc opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 3907c478bd9Sstevel@tonic-gate return (1); 3917c478bd9Sstevel@tonic-gate else 3927c478bd9Sstevel@tonic-gate return (0); 3937c478bd9Sstevel@tonic-gate } 3947c478bd9Sstevel@tonic-gate 3957c478bd9Sstevel@tonic-gate int 
3967c478bd9Sstevel@tonic-gate plat_lgrphand_to_mem_node(lgrp_handle_t hand) 3977c478bd9Sstevel@tonic-gate { 3987c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 3997c478bd9Sstevel@tonic-gate return (0); 4007c478bd9Sstevel@tonic-gate 4017c478bd9Sstevel@tonic-gate return ((int)hand); 4027c478bd9Sstevel@tonic-gate } 4037c478bd9Sstevel@tonic-gate 4047c478bd9Sstevel@tonic-gate lgrp_handle_t 4057c478bd9Sstevel@tonic-gate plat_mem_node_to_lgrphand(int mnode) 4067c478bd9Sstevel@tonic-gate { 4077c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 4087c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 4097c478bd9Sstevel@tonic-gate 4107c478bd9Sstevel@tonic-gate return ((lgrp_handle_t)mnode); 4117c478bd9Sstevel@tonic-gate } 4127c478bd9Sstevel@tonic-gate 4137c478bd9Sstevel@tonic-gate int 4147c478bd9Sstevel@tonic-gate plat_pfn_to_mem_node(pfn_t pfn) 4157c478bd9Sstevel@tonic-gate { 4167c478bd9Sstevel@tonic-gate int node; 4177c478bd9Sstevel@tonic-gate 4187c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 4197c478bd9Sstevel@tonic-gate return (0); 4207c478bd9Sstevel@tonic-gate 4217c478bd9Sstevel@tonic-gate for (node = 0; node < lgrp_plat_node_cnt; node++) { 422a940d195Sjjc /* 423a940d195Sjjc * Skip nodes with no memory 424a940d195Sjjc */ 425a940d195Sjjc if (!lgrp_plat_node_memory[node].exists) 426a940d195Sjjc continue; 427a940d195Sjjc 4287c478bd9Sstevel@tonic-gate if (pfn >= lgrp_plat_node_memory[node].start && 4297c478bd9Sstevel@tonic-gate pfn <= lgrp_plat_node_memory[node].end) 4307c478bd9Sstevel@tonic-gate return (node); 4317c478bd9Sstevel@tonic-gate } 4327c478bd9Sstevel@tonic-gate 4337c478bd9Sstevel@tonic-gate ASSERT(node < lgrp_plat_node_cnt); 4347c478bd9Sstevel@tonic-gate return (-1); 4357c478bd9Sstevel@tonic-gate } 4367c478bd9Sstevel@tonic-gate 4377c478bd9Sstevel@tonic-gate /* 4387c478bd9Sstevel@tonic-gate * Configure memory nodes for machines with more than one node (ie NUMA) 4397c478bd9Sstevel@tonic-gate */ 4407c478bd9Sstevel@tonic-gate void 
4417c478bd9Sstevel@tonic-gate plat_build_mem_nodes(struct memlist *list) 4427c478bd9Sstevel@tonic-gate { 443a940d195Sjjc pfn_t cur_start; /* start addr of subrange */ 444a940d195Sjjc pfn_t cur_end; /* end addr of subrange */ 445a940d195Sjjc pfn_t start; /* start addr of whole range */ 446a940d195Sjjc pfn_t end; /* end addr of whole range */ 4477c478bd9Sstevel@tonic-gate 4487c478bd9Sstevel@tonic-gate /* 4497c478bd9Sstevel@tonic-gate * Boot install lists are arranged <addr, len>, ... 4507c478bd9Sstevel@tonic-gate */ 4517c478bd9Sstevel@tonic-gate while (list) { 4527c478bd9Sstevel@tonic-gate int node; 4537c478bd9Sstevel@tonic-gate 4547c478bd9Sstevel@tonic-gate start = list->address >> PAGESHIFT; 4557c478bd9Sstevel@tonic-gate end = (list->address + list->size - 1) >> PAGESHIFT; 4567c478bd9Sstevel@tonic-gate 4577c478bd9Sstevel@tonic-gate if (start > physmax) { 4587c478bd9Sstevel@tonic-gate list = list->next; 4597c478bd9Sstevel@tonic-gate continue; 4607c478bd9Sstevel@tonic-gate } 4617c478bd9Sstevel@tonic-gate if (end > physmax) 4627c478bd9Sstevel@tonic-gate end = physmax; 4637c478bd9Sstevel@tonic-gate 4647c478bd9Sstevel@tonic-gate /* 4657c478bd9Sstevel@tonic-gate * When there is only one memnode, just add memory to memnode 4667c478bd9Sstevel@tonic-gate */ 4677c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) { 4687c478bd9Sstevel@tonic-gate mem_node_add_slice(start, end); 4697c478bd9Sstevel@tonic-gate list = list->next; 4707c478bd9Sstevel@tonic-gate continue; 4717c478bd9Sstevel@tonic-gate } 4727c478bd9Sstevel@tonic-gate 4737c478bd9Sstevel@tonic-gate /* 4747c478bd9Sstevel@tonic-gate * mem_node_add_slice() expects to get a memory range that 4757c478bd9Sstevel@tonic-gate * is within one memnode, so need to split any memory range 4767c478bd9Sstevel@tonic-gate * that spans multiple memnodes into subranges that are each 4777c478bd9Sstevel@tonic-gate * contained within one memnode when feeding them to 4787c478bd9Sstevel@tonic-gate * mem_node_add_slice() 
4797c478bd9Sstevel@tonic-gate */ 4807c478bd9Sstevel@tonic-gate cur_start = start; 4817c478bd9Sstevel@tonic-gate do { 4827c478bd9Sstevel@tonic-gate node = plat_pfn_to_mem_node(cur_start); 4837c478bd9Sstevel@tonic-gate 484a940d195Sjjc /* 485a940d195Sjjc * Panic if DRAM address map registers or SRAT say 486a940d195Sjjc * memory in node doesn't exist or address from 487a940d195Sjjc * boot installed memory list entry isn't in this node. 488a940d195Sjjc * This shouldn't happen and rest of code can't deal 489a940d195Sjjc * with this if it does. 490a940d195Sjjc */ 491a940d195Sjjc if (node < 0 || node >= lgrp_plat_node_cnt || 492a940d195Sjjc !lgrp_plat_node_memory[node].exists || 493a940d195Sjjc cur_start < lgrp_plat_node_memory[node].start || 494a940d195Sjjc cur_start > lgrp_plat_node_memory[node].end) { 495a940d195Sjjc cmn_err(CE_PANIC, "Don't know which memnode " 496a940d195Sjjc "to add installed memory address 0x%lx\n", 497a940d195Sjjc cur_start); 498a940d195Sjjc } 4997c478bd9Sstevel@tonic-gate 5007c478bd9Sstevel@tonic-gate /* 5017c478bd9Sstevel@tonic-gate * End of current subrange should not span memnodes 5027c478bd9Sstevel@tonic-gate */ 503a940d195Sjjc cur_end = end; 504a940d195Sjjc if (lgrp_plat_node_memory[node].exists && 505a940d195Sjjc cur_end > lgrp_plat_node_memory[node].end) 5067c478bd9Sstevel@tonic-gate cur_end = lgrp_plat_node_memory[node].end; 5077c478bd9Sstevel@tonic-gate 5087c478bd9Sstevel@tonic-gate mem_node_add_slice(cur_start, cur_end); 5097c478bd9Sstevel@tonic-gate 5107c478bd9Sstevel@tonic-gate /* 5117c478bd9Sstevel@tonic-gate * Next subrange starts after end of current one 5127c478bd9Sstevel@tonic-gate */ 5137c478bd9Sstevel@tonic-gate cur_start = cur_end + 1; 5147c478bd9Sstevel@tonic-gate } while (cur_end < end); 5157c478bd9Sstevel@tonic-gate 5167c478bd9Sstevel@tonic-gate list = list->next; 5177c478bd9Sstevel@tonic-gate } 5187c478bd9Sstevel@tonic-gate mem_node_physalign = 0; 5197c478bd9Sstevel@tonic-gate mem_node_pfn_shift = 0; 
5207c478bd9Sstevel@tonic-gate } 5217c478bd9Sstevel@tonic-gate 5227c478bd9Sstevel@tonic-gate 5237c478bd9Sstevel@tonic-gate /* 5247c478bd9Sstevel@tonic-gate * Platform-specific initialization of lgroups 5257c478bd9Sstevel@tonic-gate */ 5267c478bd9Sstevel@tonic-gate void 5277c478bd9Sstevel@tonic-gate lgrp_plat_init(void) 5287c478bd9Sstevel@tonic-gate { 529*843e1988Sjohnlev #if defined(__xpv) 530*843e1988Sjohnlev /* 531*843e1988Sjohnlev * XXPV For now, the hypervisor treats all memory equally. 532*843e1988Sjohnlev */ 533*843e1988Sjohnlev lgrp_plat_node_cnt = max_mem_nodes = 1; 534*843e1988Sjohnlev #else /* __xpv */ 5357c478bd9Sstevel@tonic-gate uint_t bus; 5367c478bd9Sstevel@tonic-gate uint_t dev; 5377c478bd9Sstevel@tonic-gate uint_t node; 538f78a91cdSjjc uint_t off_hi; 539f78a91cdSjjc uint_t off_lo; 540f78a91cdSjjc uint64_t nb_cfg_reg; 5417c478bd9Sstevel@tonic-gate 5427c478bd9Sstevel@tonic-gate extern lgrp_load_t lgrp_expand_proc_thresh; 5437c478bd9Sstevel@tonic-gate extern lgrp_load_t lgrp_expand_proc_diff; 5447c478bd9Sstevel@tonic-gate 5457c478bd9Sstevel@tonic-gate /* 5467c478bd9Sstevel@tonic-gate * Initialize as a UMA machine if this isn't an Opteron 5477c478bd9Sstevel@tonic-gate */ 5487c478bd9Sstevel@tonic-gate if (!is_opteron() || lgrp_topo_ht_limit() == 1) { 5497c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt = max_mem_nodes = 1; 5507c478bd9Sstevel@tonic-gate return; 5517c478bd9Sstevel@tonic-gate } 5527c478bd9Sstevel@tonic-gate 5537c478bd9Sstevel@tonic-gate /* 5547c478bd9Sstevel@tonic-gate * Read configuration registers from PCI configuration space to 5557c478bd9Sstevel@tonic-gate * determine node information, which memory is in each node, etc. 
5567c478bd9Sstevel@tonic-gate * 5577c478bd9Sstevel@tonic-gate * Write to PCI configuration space address register to specify 5587c478bd9Sstevel@tonic-gate * which configuration register to read and read/write PCI 5597c478bd9Sstevel@tonic-gate * configuration space data register to get/set contents 5607c478bd9Sstevel@tonic-gate */ 5617c478bd9Sstevel@tonic-gate bus = OPT_PCS_BUS_CONFIG; 5627c478bd9Sstevel@tonic-gate dev = OPT_PCS_DEV_NODE0; 563f78a91cdSjjc off_hi = OPT_PCS_OFF_DRAMBASE_HI; 564f78a91cdSjjc off_lo = OPT_PCS_OFF_DRAMBASE_LO; 5657c478bd9Sstevel@tonic-gate 5667c478bd9Sstevel@tonic-gate /* 5677c478bd9Sstevel@tonic-gate * Read node ID register for node 0 to get node count 5687c478bd9Sstevel@tonic-gate */ 569ef50d8c0Sesaxe opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 570ef50d8c0Sesaxe OPT_PCS_OFF_NODEID); 5717c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1; 5727c478bd9Sstevel@tonic-gate 573f78a91cdSjjc /* 574f78a91cdSjjc * For Greyhound, PCI Extended Configuration Space must be enabled to 575f78a91cdSjjc * read high DRAM address map base and limit registers 576f78a91cdSjjc */ 577f78a91cdSjjc if (opt_family == AMD_FAMILY_GREYHOUND) { 578f78a91cdSjjc nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 579f78a91cdSjjc if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 580f78a91cdSjjc wrmsr(MSR_AMD_NB_CFG, 581f78a91cdSjjc nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 582f78a91cdSjjc } 583f78a91cdSjjc 5847c478bd9Sstevel@tonic-gate for (node = 0; node < lgrp_plat_node_cnt; node++) { 585f78a91cdSjjc uint32_t base_hi; 586f78a91cdSjjc uint32_t base_lo; 587f78a91cdSjjc uint32_t limit_hi; 588f78a91cdSjjc uint32_t limit_lo; 589f78a91cdSjjc 5907c478bd9Sstevel@tonic-gate /* 5917c478bd9Sstevel@tonic-gate * Read node ID register (except for node 0 which we just read) 5927c478bd9Sstevel@tonic-gate */ 5937c478bd9Sstevel@tonic-gate if (node > 0) { 594ef50d8c0Sesaxe opt_node_info[node] = pci_getl_func(bus, dev, 595ef50d8c0Sesaxe OPT_PCS_FUNC_HT, 
OPT_PCS_OFF_NODEID); 5967c478bd9Sstevel@tonic-gate } 5977c478bd9Sstevel@tonic-gate 5987c478bd9Sstevel@tonic-gate /* 5997c478bd9Sstevel@tonic-gate * Read DRAM base and limit registers which specify 6007c478bd9Sstevel@tonic-gate * physical memory range of each node 6017c478bd9Sstevel@tonic-gate */ 602f78a91cdSjjc if (opt_family != AMD_FAMILY_GREYHOUND) 603f78a91cdSjjc base_hi = 0; 604f78a91cdSjjc else { 605f78a91cdSjjc outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 606f78a91cdSjjc OPT_PCS_FUNC_ADDRMAP, off_hi)); 607f78a91cdSjjc base_hi = opt_dram_map[node].base_hi = 608f78a91cdSjjc inl(PCI_CONFDATA); 609f78a91cdSjjc } 610f78a91cdSjjc base_lo = opt_dram_map[node].base_lo = pci_getl_func(bus, dev, 611f78a91cdSjjc OPT_PCS_FUNC_ADDRMAP, off_lo); 612f78a91cdSjjc 613f78a91cdSjjc if (opt_dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) 6147c478bd9Sstevel@tonic-gate lgrp_plat_mem_intrlv++; 6157c478bd9Sstevel@tonic-gate 616f78a91cdSjjc off_hi += 4; /* high limit register offset */ 617f78a91cdSjjc if (opt_family != AMD_FAMILY_GREYHOUND) 618f78a91cdSjjc limit_hi = 0; 619f78a91cdSjjc else { 620f78a91cdSjjc outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 621f78a91cdSjjc OPT_PCS_FUNC_ADDRMAP, off_hi)); 622f78a91cdSjjc limit_hi = opt_dram_map[node].limit_hi = 623f78a91cdSjjc inl(PCI_CONFDATA); 624f78a91cdSjjc } 625f78a91cdSjjc 626f78a91cdSjjc off_lo += 4; /* low limit register offset */ 627f78a91cdSjjc limit_lo = opt_dram_map[node].limit_lo = pci_getl_func(bus, 628f78a91cdSjjc dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 6297c478bd9Sstevel@tonic-gate 6307c478bd9Sstevel@tonic-gate /* 631f78a91cdSjjc * Increment device number to next node and register offsets 632f78a91cdSjjc * for DRAM base register of next node 6337c478bd9Sstevel@tonic-gate */ 634f78a91cdSjjc off_hi += 4; 635f78a91cdSjjc off_lo += 4; 6367c478bd9Sstevel@tonic-gate dev++; 6377c478bd9Sstevel@tonic-gate 6387c478bd9Sstevel@tonic-gate /* 639a940d195Sjjc * Both read and write enable bits must be enabled in DRAM 640a940d195Sjjc * 
address map base register for physical memory to exist in 641a940d195Sjjc * node 642a940d195Sjjc */ 643f78a91cdSjjc if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 644f78a91cdSjjc (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 645a940d195Sjjc /* 646a940d195Sjjc * Mark node memory as non-existent and set start and 647a940d195Sjjc * end addresses to be same in lgrp_plat_node_memory[] 648a940d195Sjjc */ 649a940d195Sjjc lgrp_plat_node_memory[node].exists = 0; 650a940d195Sjjc lgrp_plat_node_memory[node].start = 651a940d195Sjjc lgrp_plat_node_memory[node].end = (pfn_t)-1; 652a940d195Sjjc continue; 653a940d195Sjjc } 654a940d195Sjjc 655a940d195Sjjc /* 6567c478bd9Sstevel@tonic-gate * Get PFN for first page in each node, 6577c478bd9Sstevel@tonic-gate * so we can probe memory to determine latency topology 6587c478bd9Sstevel@tonic-gate */ 6597c478bd9Sstevel@tonic-gate lgrp_plat_probe_pfn[node] = 660f78a91cdSjjc btop(OPT_DRAMADDR(base_hi, base_lo)); 6617c478bd9Sstevel@tonic-gate 6627c478bd9Sstevel@tonic-gate /* 663a940d195Sjjc * Mark node memory as existing and remember physical address 664a940d195Sjjc * range of each node for use later 6657c478bd9Sstevel@tonic-gate */ 666a940d195Sjjc lgrp_plat_node_memory[node].exists = 1; 667f78a91cdSjjc 6687c478bd9Sstevel@tonic-gate lgrp_plat_node_memory[node].start = 669f78a91cdSjjc btop(OPT_DRAMADDR(base_hi, base_lo)); 670f78a91cdSjjc 6717c478bd9Sstevel@tonic-gate lgrp_plat_node_memory[node].end = 672f78a91cdSjjc btop(OPT_DRAMADDR(limit_hi, limit_lo) | 673f78a91cdSjjc OPT_DRAMADDR_LO_MASK_OFF); 674f78a91cdSjjc } 675f78a91cdSjjc 676f78a91cdSjjc /* 677f78a91cdSjjc * Restore PCI Extended Configuration Space enable bit 678f78a91cdSjjc */ 679f78a91cdSjjc if (opt_family == AMD_FAMILY_GREYHOUND) { 680f78a91cdSjjc if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 681f78a91cdSjjc wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 6827c478bd9Sstevel@tonic-gate } 6837c478bd9Sstevel@tonic-gate 6847c478bd9Sstevel@tonic-gate /* 6857c478bd9Sstevel@tonic-gate * Only use one 
memory node if memory is interleaved between any nodes 6867c478bd9Sstevel@tonic-gate */ 6877c478bd9Sstevel@tonic-gate if (lgrp_plat_mem_intrlv) { 6887c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt = max_mem_nodes = 1; 6897c478bd9Sstevel@tonic-gate (void) lgrp_topo_ht_limit_set(1); 6907c478bd9Sstevel@tonic-gate } else { 6917c478bd9Sstevel@tonic-gate max_mem_nodes = lgrp_plat_node_cnt; 6927c478bd9Sstevel@tonic-gate 6937c478bd9Sstevel@tonic-gate /* 6947c478bd9Sstevel@tonic-gate * Probing errors can mess up the lgroup topology and force us 6957c478bd9Sstevel@tonic-gate * fall back to a 2 level lgroup topology. Here we bound how 6967c478bd9Sstevel@tonic-gate * tall the lgroup topology can grow in hopes of avoiding any 6977c478bd9Sstevel@tonic-gate * anamolies in probing from messing up the lgroup topology 6987c478bd9Sstevel@tonic-gate * by limiting the accuracy of the latency topology. 6997c478bd9Sstevel@tonic-gate * 7007c478bd9Sstevel@tonic-gate * Assume that nodes will at least be configured in a ring, 7017c478bd9Sstevel@tonic-gate * so limit height of lgroup topology to be less than number 7027c478bd9Sstevel@tonic-gate * of nodes on a system with 4 or more nodes 7037c478bd9Sstevel@tonic-gate */ 7047c478bd9Sstevel@tonic-gate if (lgrp_plat_node_cnt >= 4 && 7057c478bd9Sstevel@tonic-gate lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default()) 7067c478bd9Sstevel@tonic-gate (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 7077c478bd9Sstevel@tonic-gate } 7087c478bd9Sstevel@tonic-gate 7097c478bd9Sstevel@tonic-gate /* 7107c478bd9Sstevel@tonic-gate * Lgroups on Opteron architectures have but a single physical 7117c478bd9Sstevel@tonic-gate * processor. Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff 7127c478bd9Sstevel@tonic-gate * so that lgrp_choose() will spread things out aggressively. 
7137c478bd9Sstevel@tonic-gate */ 7147c478bd9Sstevel@tonic-gate lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 7157c478bd9Sstevel@tonic-gate lgrp_expand_proc_diff = 0; 716*843e1988Sjohnlev #endif /* __xpv */ 7177c478bd9Sstevel@tonic-gate } 7187c478bd9Sstevel@tonic-gate 7197c478bd9Sstevel@tonic-gate 7207c478bd9Sstevel@tonic-gate /* 7217c478bd9Sstevel@tonic-gate * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 7227c478bd9Sstevel@tonic-gate * be considered same 7237c478bd9Sstevel@tonic-gate */ 7247c478bd9Sstevel@tonic-gate #define LGRP_LAT_TOLERANCE_SHIFT 4 7257c478bd9Sstevel@tonic-gate 7267c478bd9Sstevel@tonic-gate int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 7277c478bd9Sstevel@tonic-gate 7287c478bd9Sstevel@tonic-gate 7297c478bd9Sstevel@tonic-gate /* 7307c478bd9Sstevel@tonic-gate * Adjust latencies between nodes to be symmetric, normalize latencies between 7317c478bd9Sstevel@tonic-gate * any nodes that are within some tolerance to be same, and make local 7327c478bd9Sstevel@tonic-gate * latencies be same 7337c478bd9Sstevel@tonic-gate */ 7347c478bd9Sstevel@tonic-gate static void 7357c478bd9Sstevel@tonic-gate lgrp_plat_latency_adjust(void) 7367c478bd9Sstevel@tonic-gate { 7377c478bd9Sstevel@tonic-gate int i; 7387c478bd9Sstevel@tonic-gate int j; 7397c478bd9Sstevel@tonic-gate int k; 7407c478bd9Sstevel@tonic-gate int l; 7417c478bd9Sstevel@tonic-gate u_longlong_t max; 7427c478bd9Sstevel@tonic-gate u_longlong_t min; 7437c478bd9Sstevel@tonic-gate u_longlong_t t; 7447c478bd9Sstevel@tonic-gate u_longlong_t t1; 7457c478bd9Sstevel@tonic-gate u_longlong_t t2; 74603400a71Sjjc const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 7477c478bd9Sstevel@tonic-gate int lat_corrected[MAX_NODES][MAX_NODES]; 7487c478bd9Sstevel@tonic-gate 7497c478bd9Sstevel@tonic-gate /* 7507c478bd9Sstevel@tonic-gate * Nothing to do when this is an UMA machine 7517c478bd9Sstevel@tonic-gate */ 7527c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 
7537c478bd9Sstevel@tonic-gate return; 7547c478bd9Sstevel@tonic-gate 7557c478bd9Sstevel@tonic-gate /* 7567c478bd9Sstevel@tonic-gate * Make sure that latencies are symmetric between any two nodes 7577c478bd9Sstevel@tonic-gate * (ie. latency(node0, node1) == latency(node1, node0)) 7587c478bd9Sstevel@tonic-gate */ 7597c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 7607c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 7617c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[i][j]; 7627c478bd9Sstevel@tonic-gate t2 = lgrp_plat_probe_times[j][i]; 7637c478bd9Sstevel@tonic-gate 7647c478bd9Sstevel@tonic-gate if (t1 == 0 || t2 == 0 || t1 == t2) 7657c478bd9Sstevel@tonic-gate continue; 7667c478bd9Sstevel@tonic-gate 7677c478bd9Sstevel@tonic-gate /* 7687c478bd9Sstevel@tonic-gate * Latencies should be same 7697c478bd9Sstevel@tonic-gate * - Use minimum of two latencies which should be same 7707c478bd9Sstevel@tonic-gate * - Track suspect probe times not within tolerance of 7717c478bd9Sstevel@tonic-gate * min value 7727c478bd9Sstevel@tonic-gate * - Remember how much values are corrected by 7737c478bd9Sstevel@tonic-gate */ 7747c478bd9Sstevel@tonic-gate if (t1 > t2) { 7757c478bd9Sstevel@tonic-gate t = t2; 7767c478bd9Sstevel@tonic-gate lgrp_plat_probe_errors[i][j] += t1 - t2; 7777c478bd9Sstevel@tonic-gate if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 7787c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[i][j]++; 7797c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[j][i]++; 7807c478bd9Sstevel@tonic-gate } 7817c478bd9Sstevel@tonic-gate } else if (t2 > t1) { 7827c478bd9Sstevel@tonic-gate t = t1; 7837c478bd9Sstevel@tonic-gate lgrp_plat_probe_errors[j][i] += t2 - t1; 7847c478bd9Sstevel@tonic-gate if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 7857c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[i][j]++; 7867c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[j][i]++; 7877c478bd9Sstevel@tonic-gate } 7887c478bd9Sstevel@tonic-gate } 
7897c478bd9Sstevel@tonic-gate 7907c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][j] = 7917c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[j][i] = t; 7927c478bd9Sstevel@tonic-gate lgrp_config(cflag, t1, t); 7937c478bd9Sstevel@tonic-gate lgrp_config(cflag, t2, t); 7947c478bd9Sstevel@tonic-gate } 7957c478bd9Sstevel@tonic-gate 7967c478bd9Sstevel@tonic-gate /* 7977c478bd9Sstevel@tonic-gate * Keep track of which latencies get corrected 7987c478bd9Sstevel@tonic-gate */ 7997c478bd9Sstevel@tonic-gate for (i = 0; i < MAX_NODES; i++) 8007c478bd9Sstevel@tonic-gate for (j = 0; j < MAX_NODES; j++) 8017c478bd9Sstevel@tonic-gate lat_corrected[i][j] = 0; 8027c478bd9Sstevel@tonic-gate 8037c478bd9Sstevel@tonic-gate /* 8047c478bd9Sstevel@tonic-gate * For every two nodes, see whether there is another pair of nodes which 8057c478bd9Sstevel@tonic-gate * are about the same distance apart and make the latencies be the same 8067c478bd9Sstevel@tonic-gate * if they are close enough together 8077c478bd9Sstevel@tonic-gate */ 8087c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 8097c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 8107c478bd9Sstevel@tonic-gate /* 8117c478bd9Sstevel@tonic-gate * Pick one pair of nodes (i, j) 8127c478bd9Sstevel@tonic-gate * and get latency between them 8137c478bd9Sstevel@tonic-gate */ 8147c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[i][j]; 8157c478bd9Sstevel@tonic-gate 8167c478bd9Sstevel@tonic-gate /* 8177c478bd9Sstevel@tonic-gate * Skip this pair of nodes if there isn't a latency 8187c478bd9Sstevel@tonic-gate * for it yet 8197c478bd9Sstevel@tonic-gate */ 8207c478bd9Sstevel@tonic-gate if (t1 == 0) 8217c478bd9Sstevel@tonic-gate continue; 8227c478bd9Sstevel@tonic-gate 8237c478bd9Sstevel@tonic-gate for (k = 0; k < lgrp_plat_node_cnt; k++) 8247c478bd9Sstevel@tonic-gate for (l = 0; l < lgrp_plat_node_cnt; l++) { 8257c478bd9Sstevel@tonic-gate /* 8267c478bd9Sstevel@tonic-gate * Pick another pair of nodes (k, l) 
8277c478bd9Sstevel@tonic-gate * not same as (i, j) and get latency 8287c478bd9Sstevel@tonic-gate * between them 8297c478bd9Sstevel@tonic-gate */ 8307c478bd9Sstevel@tonic-gate if (k == i && l == j) 8317c478bd9Sstevel@tonic-gate continue; 8327c478bd9Sstevel@tonic-gate 8337c478bd9Sstevel@tonic-gate t2 = lgrp_plat_probe_times[k][l]; 8347c478bd9Sstevel@tonic-gate 8357c478bd9Sstevel@tonic-gate /* 8367c478bd9Sstevel@tonic-gate * Skip this pair of nodes if there 8377c478bd9Sstevel@tonic-gate * isn't a latency for it yet 8387c478bd9Sstevel@tonic-gate */ 8397c478bd9Sstevel@tonic-gate 8407c478bd9Sstevel@tonic-gate if (t2 == 0) 8417c478bd9Sstevel@tonic-gate continue; 8427c478bd9Sstevel@tonic-gate 8437c478bd9Sstevel@tonic-gate /* 8447c478bd9Sstevel@tonic-gate * Skip nodes (k, l) if they already 8457c478bd9Sstevel@tonic-gate * have same latency as (i, j) or 8467c478bd9Sstevel@tonic-gate * their latency isn't close enough to 8477c478bd9Sstevel@tonic-gate * be considered/made the same 8487c478bd9Sstevel@tonic-gate */ 8497c478bd9Sstevel@tonic-gate if (t1 == t2 || (t1 > t2 && t1 - t2 > 8507c478bd9Sstevel@tonic-gate t1 >> lgrp_plat_probe_lt_shift) || 8517c478bd9Sstevel@tonic-gate (t2 > t1 && t2 - t1 > 8527c478bd9Sstevel@tonic-gate t2 >> lgrp_plat_probe_lt_shift)) 8537c478bd9Sstevel@tonic-gate continue; 8547c478bd9Sstevel@tonic-gate 8557c478bd9Sstevel@tonic-gate /* 8567c478bd9Sstevel@tonic-gate * Make latency(i, j) same as 8577c478bd9Sstevel@tonic-gate * latency(k, l), try to use latency 8587c478bd9Sstevel@tonic-gate * that has been adjusted already to get 8597c478bd9Sstevel@tonic-gate * more consistency (if possible), and 8607c478bd9Sstevel@tonic-gate * remember which latencies were 8617c478bd9Sstevel@tonic-gate * adjusted for next time 8627c478bd9Sstevel@tonic-gate */ 8637c478bd9Sstevel@tonic-gate if (lat_corrected[i][j]) { 8647c478bd9Sstevel@tonic-gate t = t1; 8657c478bd9Sstevel@tonic-gate lgrp_config(cflag, t2, t); 8667c478bd9Sstevel@tonic-gate t2 = t; 
8677c478bd9Sstevel@tonic-gate } else if (lat_corrected[k][l]) { 8687c478bd9Sstevel@tonic-gate t = t2; 8697c478bd9Sstevel@tonic-gate lgrp_config(cflag, t1, t); 8707c478bd9Sstevel@tonic-gate t1 = t; 8717c478bd9Sstevel@tonic-gate } else { 8727c478bd9Sstevel@tonic-gate if (t1 > t2) 8737c478bd9Sstevel@tonic-gate t = t2; 8747c478bd9Sstevel@tonic-gate else 8757c478bd9Sstevel@tonic-gate t = t1; 8767c478bd9Sstevel@tonic-gate lgrp_config(cflag, t1, t); 8777c478bd9Sstevel@tonic-gate lgrp_config(cflag, t2, t); 8787c478bd9Sstevel@tonic-gate t1 = t2 = t; 8797c478bd9Sstevel@tonic-gate } 8807c478bd9Sstevel@tonic-gate 8817c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][j] = 8827c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[k][l] = t; 8837c478bd9Sstevel@tonic-gate 8847c478bd9Sstevel@tonic-gate lat_corrected[i][j] = 8857c478bd9Sstevel@tonic-gate lat_corrected[k][l] = 1; 8867c478bd9Sstevel@tonic-gate } 8877c478bd9Sstevel@tonic-gate } 8887c478bd9Sstevel@tonic-gate 8897c478bd9Sstevel@tonic-gate /* 8907c478bd9Sstevel@tonic-gate * Local latencies should be same 8917c478bd9Sstevel@tonic-gate * - Find min and max local latencies 8927c478bd9Sstevel@tonic-gate * - Make all local latencies be minimum 8937c478bd9Sstevel@tonic-gate */ 8947c478bd9Sstevel@tonic-gate min = -1; 8957c478bd9Sstevel@tonic-gate max = 0; 8967c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 8977c478bd9Sstevel@tonic-gate t = lgrp_plat_probe_times[i][i]; 8987c478bd9Sstevel@tonic-gate if (t == 0) 8997c478bd9Sstevel@tonic-gate continue; 9007c478bd9Sstevel@tonic-gate if (min == -1 || t < min) 9017c478bd9Sstevel@tonic-gate min = t; 9027c478bd9Sstevel@tonic-gate if (t > max) 9037c478bd9Sstevel@tonic-gate max = t; 9047c478bd9Sstevel@tonic-gate } 9057c478bd9Sstevel@tonic-gate if (min != max) { 9067c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 9077c478bd9Sstevel@tonic-gate int local; 9087c478bd9Sstevel@tonic-gate 9097c478bd9Sstevel@tonic-gate local = lgrp_plat_probe_times[i][i]; 
9107c478bd9Sstevel@tonic-gate if (local == 0) 9117c478bd9Sstevel@tonic-gate continue; 9127c478bd9Sstevel@tonic-gate 9137c478bd9Sstevel@tonic-gate /* 9147c478bd9Sstevel@tonic-gate * Track suspect probe times that aren't within 9157c478bd9Sstevel@tonic-gate * tolerance of minimum local latency and how much 9167c478bd9Sstevel@tonic-gate * probe times are corrected by 9177c478bd9Sstevel@tonic-gate */ 9187c478bd9Sstevel@tonic-gate if (local - min > min >> lgrp_plat_probe_lt_shift) 9197c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[i][i]++; 9207c478bd9Sstevel@tonic-gate 9217c478bd9Sstevel@tonic-gate lgrp_plat_probe_errors[i][i] += local - min; 9227c478bd9Sstevel@tonic-gate 9237c478bd9Sstevel@tonic-gate /* 9247c478bd9Sstevel@tonic-gate * Make local latencies be minimum 9257c478bd9Sstevel@tonic-gate */ 92603400a71Sjjc lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 9277c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][i] = min; 9287c478bd9Sstevel@tonic-gate } 9297c478bd9Sstevel@tonic-gate } 9307c478bd9Sstevel@tonic-gate 9317c478bd9Sstevel@tonic-gate /* 9327c478bd9Sstevel@tonic-gate * Determine max probe time again since just adjusted latencies 9337c478bd9Sstevel@tonic-gate */ 9347c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max = 0; 9357c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 9367c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 9377c478bd9Sstevel@tonic-gate t = lgrp_plat_probe_times[i][j]; 9387c478bd9Sstevel@tonic-gate if (t > lgrp_plat_probe_time_max) 9397c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max = t; 9407c478bd9Sstevel@tonic-gate } 9417c478bd9Sstevel@tonic-gate } 9427c478bd9Sstevel@tonic-gate 9437c478bd9Sstevel@tonic-gate 9447c478bd9Sstevel@tonic-gate /* 9457c478bd9Sstevel@tonic-gate * Verify following about latencies between nodes: 9467c478bd9Sstevel@tonic-gate * 9477c478bd9Sstevel@tonic-gate * - Latencies should be symmetric (ie. 
 *	latency(a, b) == latency(b, a))
 * - Local latencies same
 * - Local < remote
 * - Number of latencies seen is reasonable
 * - Number of occurrences of a given latency should be more than 1
 *
 * Returns:
 *	0	Success
 *	-1	Not symmetric
 *	-2	Local latencies not same
 *	-3	Local >= remote
 *	-4	Wrong number of latencies
 *	-5	Not enough occurrences of given latency
 */
static int
lgrp_plat_latency_verify(void)
{
	int				i;
	int				j;
	lgrp_plat_latency_acct_t	*l;
	int				probed;
	u_longlong_t			t1;
	u_longlong_t			t2;

	/*
	 * Nothing to do when this is an UMA machine, lgroup topology is
	 * limited to 2 levels, or there aren't any probe times yet
	 */
	if (max_mem_nodes == 1 || lgrp_topo_levels < 2 ||
	    (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1))
		return (0);

	/*
	 * Make sure that latencies are symmetric between any two nodes
	 * (ie. latency(node0, node1) == latency(node1, node0))
	 *
	 * A zero entry means that pair hasn't been probed yet, so it is
	 * skipped rather than treated as asymmetric.
	 */
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			t1 = lgrp_plat_probe_times[i][j];
			t2 = lgrp_plat_probe_times[j][i];

			if (t1 == 0 || t2 == 0 || t1 == t2)
				continue;

			return (-1);
		}

	/*
	 * Local latencies should be same
	 *
	 * The first non-zero local latency found becomes the reference
	 * value (t1) that all the others must match.
	 */
	t1 = lgrp_plat_probe_times[0][0];
	for (i = 1; i < lgrp_plat_node_cnt; i++) {
		t2 = lgrp_plat_probe_times[i][i];
		if (t2 == 0)
			continue;

		if (t1 == 0) {
			t1 = t2;
			continue;
		}

		if (t1 != t2)
			return (-2);
	}

	/*
	 * Local latencies should be less than remote
	 * (t1 still holds the common local latency from the loop above;
	 * zero means no local latency is known yet, so skip the check)
	 */
	if (t1) {
		for (i = 0; i < lgrp_plat_node_cnt; i++)
			for (j = 0; j < lgrp_plat_node_cnt; j++) {
				t2 = lgrp_plat_probe_times[i][j];
				if (i == j || t2 == 0)
					continue;

				if (t1 >= t2)
					return (-3);
			}
	}

	/*
	 * Rest of checks are not very useful for machines with less than
	 * 4 nodes (which means less than 3 latencies on Opteron)
	 */
	if (lgrp_plat_node_cnt < 4)
		return (0);

	/*
	 * Need to see whether done probing in order to verify number of
	 * latencies are correct
	 */
	probed = 0;
	for (i = 0; i < lgrp_plat_node_cnt; i++)
		if (lgrp_plat_probe_times[i][i])
			probed++;

	if (probed != lgrp_plat_node_cnt)
		return (0);

	/*
	 * Determine number of unique latencies seen in probe times,
	 * their values, and number of occurrences of each
	 */
	lgrp_plat_probe_nlatencies = 0;
	bzero(lgrp_plat_probe_lat_acct,
	    MAX_NODES * sizeof (lgrp_plat_latency_acct_t));
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			int	k;

			/*
			 * Look at each probe time
			 */
			t1 = lgrp_plat_probe_times[i][j];
			if (t1 == 0)
				continue;

			/*
			 * Account for unique latencies
			 */
			for (k = 0; k < lgrp_plat_node_cnt; k++) {
				l = &lgrp_plat_probe_lat_acct[k];
				if (t1 == l->la_value) {
					/*
					 * Increment number of occurrences
					 * if seen before
					 */
					l->la_count++;
					break;
				} else if (l->la_value == 0) {
					/*
					 * Record latency if haven't seen before
					 */
					l->la_value = t1;
					l->la_count++;
					lgrp_plat_probe_nlatencies++;
					break;
				}
			}
		}
	}

	/*
	 * Number of latencies should be relative to number of
	 * nodes in system:
	 * - Same as nodes when nodes <= 2
	 * - Less than nodes when nodes > 2
	 * - Greater than 2 when nodes >= 4
	 */
	if ((lgrp_plat_node_cnt <= 2 &&
	    lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt > 2 &&
	    lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) ||
	    (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 &&
	    lgrp_plat_probe_nlatencies <= 2))
		return (-4);

	/*
	 * There should be more than one occurrence of every latency
	 * as long as probing is complete
	 */
	for (i = 0; i < lgrp_plat_probe_nlatencies; i++) {
		l = &lgrp_plat_probe_lat_acct[i];
		if (l->la_count <= 1)
			return (-5);
	}
	return (0);
}


/*
 * Set lgroup latencies for 2 level lgroup topology
 *
 * Fallback used when probed latencies fail verification: overwrite the
 * probe time matrix with synthetic values (2 = local, 3 = remote) so the
 * lgroup framework only distinguishes local from remote, and tell the
 * framework to flatten the topology.
 */
static void
lgrp_plat_2level_setup(void)
{
	int	i;

	if (lgrp_plat_node_cnt >= 4)
		cmn_err(CE_NOTE,
		    "MPO only optimizing for local and remote\n");
	for (i = 0; i < lgrp_plat_node_cnt; i++) {
		int	j;

		for (j = 0; j < lgrp_plat_node_cnt; j++) {
			if (i == j)
				lgrp_plat_probe_times[i][j] = 2;
			else
				lgrp_plat_probe_times[i][j] = 3;
		}
	}
	lgrp_plat_probe_time_min = 2;
	lgrp_plat_probe_time_max = 3;
	lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0);
}

/*
 * Return time needed to probe from current CPU to memory in given node
 *
 * Takes lgrp_plat_probe_nsamples samples using the configured probe
 * operation (page copy onto itself, or repeated Northbridge vendor ID
 * reads) and returns the minimum elapsed time observed.  Returns 0 when
 * the target node has no probe memory or gethrtime() isn't working yet.
 * Side effects: updates lgrp_plat_probe_times[from][to] (zeroed for
 * memoryless nodes), the per-pair min/max tables, and the probe cost
 * accounting globals.
 */
static hrtime_t
lgrp_plat_probe_time(int to)
{
	caddr_t		buf;
	uint_t		dev;
	/* LINTED: set but not used in function */
	volatile uint_t	dev_vendor;
	hrtime_t	elapsed;
	hrtime_t	end;
	int		from;
	int		i;
	int		ipl;
	hrtime_t	max;
	hrtime_t	min;
	hrtime_t	start;
	int		cnt;
	extern int	use_sse_pagecopy;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Do common work for probing main memory
	 */
	if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) {
		/*
		 * Skip probing any nodes without memory and
		 * set probe time to 0
		 */
		if (lgrp_plat_probe_memory[to] == NULL) {
			lgrp_plat_probe_times[from][to] = 0;
			return (0);
		}

		/*
		 * Invalidate caches once instead of once every sample
		 * which should cut cost of probing by a lot
		 */
		lgrp_plat_flush_cost = gethrtime();
		invalidate_cache();
		lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_flush_cost;
	}

	/*
	 * Probe from current CPU to given memory using specified operation
	 * and take specified number of samples
	 */
	max = 0;
	min = -1;
	for (i = 0; i < lgrp_plat_probe_nsamples; i++) {
		lgrp_plat_probe_cost = gethrtime();

		/*
		 * Can't measure probe time if gethrtime() isn't working yet
		 */
		if (lgrp_plat_probe_cost == 0 && gethrtime() == 0)
			return (0);

		switch (lgrp_plat_probe_op) {

		case LGRP_PLAT_PROBE_PGCPY:
		default:
			/*
			 * Measure how long it takes to copy page
			 * on top of itself
			 * (preemption disabled and IPL raised so the
			 * timing isn't perturbed by other activity)
			 */
			buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE);

			kpreempt_disable();
			ipl = splhigh();
			start = gethrtime();
			if (use_sse_pagecopy)
				hwblkpagecopy(buf, buf);
			else
				bcopy(buf, buf, PAGESIZE);
			end = gethrtime();
			elapsed = end - start;
			splx(ipl);
			kpreempt_enable();
			break;

		case LGRP_PLAT_PROBE_VENDOR:
			/*
			 * Measure how long it takes to read vendor ID from
			 * Northbridge
			 * (average over lgrp_plat_probe_nreads PCI config
			 * space reads of the target node's device)
			 */
			dev = OPT_PCS_DEV_NODE0 + to;
			kpreempt_disable();
			ipl = spl8();
			outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func,
			    OPT_PCS_OFF_VENDOR));
			start = gethrtime();
			for (cnt = 0; cnt < lgrp_plat_probe_nreads; cnt++)
				dev_vendor = inl(PCI_CONFDATA);
			end = gethrtime();
			elapsed = (end - start) / lgrp_plat_probe_nreads;
			splx(ipl);
			kpreempt_enable();
			break;
		}

		lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost;
		lgrp_plat_probe_cost_total += lgrp_plat_probe_cost;

		if (min == -1 || elapsed < min)
			min = elapsed;
		if (elapsed > max)
			max = elapsed;
	}

	/*
	 * Update minimum and maximum probe times between
	 * these two nodes
	 */
	if (min < lgrp_plat_probe_min[from][to] ||
	    lgrp_plat_probe_min[from][to] == 0)
		lgrp_plat_probe_min[from][to] = min;

	if (max > lgrp_plat_probe_max[from][to])
		lgrp_plat_probe_max[from][to] = max;

	return (min);
}


/*
 * Probe memory in each node from current CPU to determine latency topology
 */
void
lgrp_plat_probe(void)
{
	int		from;
	int		i;
	hrtime_t	probe_time;
	int		to;

	if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2)
		return;

	/*
	 * Determine ID of node containing current CPU
	 */
	from = LGRP_PLAT_CPU_TO_NODE(CPU);

	/*
	 * Don't need to probe if got times already
	 */
	if (lgrp_plat_probe_times[from][from] != 0)
		return;

	/*
	 * Read vendor ID in Northbridge or read and write page(s)
	 * in each node from current CPU and remember how long it takes,
	 * so we can build latency topology of machine later.
	 * This should approximate the memory latency between each node.
	 */
	for (i = 0; i < lgrp_plat_probe_nrounds; i++)
		for (to = 0; to < lgrp_plat_node_cnt; to++) {
			/*
			 * Get probe time and bail out if can't get it yet
			 */
			probe_time = lgrp_plat_probe_time(to);
			if (probe_time == 0)
				return;

			/*
			 * Keep lowest probe time as latency between nodes
			 */
			if (lgrp_plat_probe_times[from][to] == 0 ||
			    probe_time < lgrp_plat_probe_times[from][to])
				lgrp_plat_probe_times[from][to] = probe_time;

			/*
			 * Update overall minimum and maximum probe times
			 * across all nodes
			 */
			if (probe_time < lgrp_plat_probe_time_min ||
			    lgrp_plat_probe_time_min == -1)
				lgrp_plat_probe_time_min = probe_time;
			if (probe_time > lgrp_plat_probe_time_max)
				lgrp_plat_probe_time_max = probe_time;
		}

	/*
	 * - Fix up latencies such that local latencies are same,
	 *   latency(i, j) == latency(j, i), etc. (if possible)
	 *
	 * - Verify that latencies look ok
	 *
	 * - Fallback to just optimizing for local and remote if
	 *   latencies didn't look right
	 */
	lgrp_plat_latency_adjust();
	lgrp_plat_probe_error_code = lgrp_plat_latency_verify();
	if (lgrp_plat_probe_error_code)
		lgrp_plat_2level_setup();
}


/*
 * Platform-specific initialization
 */
void
lgrp_plat_main_init(void)
{
	int	curnode;
	int	ht_limit;
	int	i;

	/*
	 * Print a notice that MPO is disabled when memory is interleaved
	 * across nodes....Would do this when it is discovered, but can't
	 * because it happens way too early during boot....
13637c478bd9Sstevel@tonic-gate */ 13647c478bd9Sstevel@tonic-gate if (lgrp_plat_mem_intrlv) 13657c478bd9Sstevel@tonic-gate cmn_err(CE_NOTE, 13667c478bd9Sstevel@tonic-gate "MPO disabled because memory is interleaved\n"); 13677c478bd9Sstevel@tonic-gate 13687c478bd9Sstevel@tonic-gate /* 13697c478bd9Sstevel@tonic-gate * Don't bother to do any probing if there is only one node or the 13707c478bd9Sstevel@tonic-gate * height of the lgroup topology less than or equal to 2 13717c478bd9Sstevel@tonic-gate */ 13727c478bd9Sstevel@tonic-gate ht_limit = lgrp_topo_ht_limit(); 13737c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1 || ht_limit <= 2) { 13747c478bd9Sstevel@tonic-gate /* 13757c478bd9Sstevel@tonic-gate * Setup lgroup latencies for 2 level lgroup topology 13767c478bd9Sstevel@tonic-gate * (ie. local and remote only) if they haven't been set yet 13777c478bd9Sstevel@tonic-gate */ 13787c478bd9Sstevel@tonic-gate if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 && 13797c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max == 0) 13807c478bd9Sstevel@tonic-gate lgrp_plat_2level_setup(); 13817c478bd9Sstevel@tonic-gate return; 13827c478bd9Sstevel@tonic-gate } 13837c478bd9Sstevel@tonic-gate 13847c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) { 13857c478bd9Sstevel@tonic-gate /* 13867c478bd9Sstevel@tonic-gate * Should have been able to probe from CPU 0 when it was added 13877c478bd9Sstevel@tonic-gate * to lgroup hierarchy, but may not have been able to then 13887c478bd9Sstevel@tonic-gate * because it happens so early in boot that gethrtime() hasn't 13897c478bd9Sstevel@tonic-gate * been initialized. 
(:-( 13907c478bd9Sstevel@tonic-gate */ 13917c478bd9Sstevel@tonic-gate curnode = LGRP_PLAT_CPU_TO_NODE(CPU); 13927c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[curnode][curnode] == 0) 13937c478bd9Sstevel@tonic-gate lgrp_plat_probe(); 13947c478bd9Sstevel@tonic-gate 13957c478bd9Sstevel@tonic-gate return; 13967c478bd9Sstevel@tonic-gate } 13977c478bd9Sstevel@tonic-gate 13987c478bd9Sstevel@tonic-gate /* 13997c478bd9Sstevel@tonic-gate * When probing memory, use one page for every sample to determine 14007c478bd9Sstevel@tonic-gate * lgroup topology and taking multiple samples 14017c478bd9Sstevel@tonic-gate */ 14027c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_memsize == 0) 14037c478bd9Sstevel@tonic-gate lgrp_plat_probe_memsize = PAGESIZE * 14047c478bd9Sstevel@tonic-gate lgrp_plat_probe_nsamples; 14057c478bd9Sstevel@tonic-gate 14067c478bd9Sstevel@tonic-gate /* 14077c478bd9Sstevel@tonic-gate * Map memory in each node needed for probing to determine latency 14087c478bd9Sstevel@tonic-gate * topology 14097c478bd9Sstevel@tonic-gate */ 14107c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 14117c478bd9Sstevel@tonic-gate int mnode; 14127c478bd9Sstevel@tonic-gate 14137c478bd9Sstevel@tonic-gate /* 14147c478bd9Sstevel@tonic-gate * Skip this node and leave its probe page NULL 14157c478bd9Sstevel@tonic-gate * if it doesn't have any memory 14167c478bd9Sstevel@tonic-gate */ 14177c478bd9Sstevel@tonic-gate mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); 14187c478bd9Sstevel@tonic-gate if (!mem_node_config[mnode].exists) { 14197c478bd9Sstevel@tonic-gate lgrp_plat_probe_memory[i] = NULL; 14207c478bd9Sstevel@tonic-gate continue; 14217c478bd9Sstevel@tonic-gate } 14227c478bd9Sstevel@tonic-gate 14237c478bd9Sstevel@tonic-gate /* 14247c478bd9Sstevel@tonic-gate * Allocate one kernel virtual page 14257c478bd9Sstevel@tonic-gate */ 14267c478bd9Sstevel@tonic-gate lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena, 14277c478bd9Sstevel@tonic-gate lgrp_plat_probe_memsize, 
VM_NOSLEEP); 14287c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_memory[i] == NULL) { 14297c478bd9Sstevel@tonic-gate cmn_err(CE_WARN, 14307c478bd9Sstevel@tonic-gate "lgrp_plat_main_init: couldn't allocate memory"); 14317c478bd9Sstevel@tonic-gate return; 14327c478bd9Sstevel@tonic-gate } 14337c478bd9Sstevel@tonic-gate 14347c478bd9Sstevel@tonic-gate /* 14357c478bd9Sstevel@tonic-gate * Map virtual page to first page in node 14367c478bd9Sstevel@tonic-gate */ 14377c478bd9Sstevel@tonic-gate hat_devload(kas.a_hat, lgrp_plat_probe_memory[i], 14387c478bd9Sstevel@tonic-gate lgrp_plat_probe_memsize, 14397c478bd9Sstevel@tonic-gate lgrp_plat_probe_pfn[i], 14407c478bd9Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 14417c478bd9Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 14427c478bd9Sstevel@tonic-gate } 14437c478bd9Sstevel@tonic-gate 14447c478bd9Sstevel@tonic-gate /* 14457c478bd9Sstevel@tonic-gate * Probe from current CPU 14467c478bd9Sstevel@tonic-gate */ 14477c478bd9Sstevel@tonic-gate lgrp_plat_probe(); 14487c478bd9Sstevel@tonic-gate } 14497c478bd9Sstevel@tonic-gate 14507c478bd9Sstevel@tonic-gate /* 14517c478bd9Sstevel@tonic-gate * Allocate additional space for an lgroup. 
14527c478bd9Sstevel@tonic-gate */ 14537c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14547c478bd9Sstevel@tonic-gate lgrp_t * 14557c478bd9Sstevel@tonic-gate lgrp_plat_alloc(lgrp_id_t lgrpid) 14567c478bd9Sstevel@tonic-gate { 14577c478bd9Sstevel@tonic-gate lgrp_t *lgrp; 14587c478bd9Sstevel@tonic-gate 14597c478bd9Sstevel@tonic-gate lgrp = &lgrp_space[nlgrps_alloc++]; 14607c478bd9Sstevel@tonic-gate if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 14617c478bd9Sstevel@tonic-gate return (NULL); 14627c478bd9Sstevel@tonic-gate return (lgrp); 14637c478bd9Sstevel@tonic-gate } 14647c478bd9Sstevel@tonic-gate 14657c478bd9Sstevel@tonic-gate /* 14667c478bd9Sstevel@tonic-gate * Platform handling for (re)configuration changes 14677c478bd9Sstevel@tonic-gate */ 14687c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14697c478bd9Sstevel@tonic-gate void 14707c478bd9Sstevel@tonic-gate lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 14717c478bd9Sstevel@tonic-gate { 14727c478bd9Sstevel@tonic-gate } 14737c478bd9Sstevel@tonic-gate 14747c478bd9Sstevel@tonic-gate /* 14757c478bd9Sstevel@tonic-gate * Return the platform handle for the lgroup containing the given CPU 14767c478bd9Sstevel@tonic-gate */ 14777c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14787c478bd9Sstevel@tonic-gate lgrp_handle_t 14797c478bd9Sstevel@tonic-gate lgrp_plat_cpu_to_hand(processorid_t id) 14807c478bd9Sstevel@tonic-gate { 14817c478bd9Sstevel@tonic-gate if (lgrp_plat_node_cnt == 1) 14827c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 14837c478bd9Sstevel@tonic-gate 14847c478bd9Sstevel@tonic-gate return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id])); 14857c478bd9Sstevel@tonic-gate } 14867c478bd9Sstevel@tonic-gate 14877c478bd9Sstevel@tonic-gate /* 14887c478bd9Sstevel@tonic-gate * Return the platform handle of the lgroup that contains the physical memory 14897c478bd9Sstevel@tonic-gate * corresponding to the given page frame number 14907c478bd9Sstevel@tonic-gate */ 14917c478bd9Sstevel@tonic-gate /* ARGSUSED */ 
14927c478bd9Sstevel@tonic-gate lgrp_handle_t 14937c478bd9Sstevel@tonic-gate lgrp_plat_pfn_to_hand(pfn_t pfn) 14947c478bd9Sstevel@tonic-gate { 14957c478bd9Sstevel@tonic-gate int mnode; 14967c478bd9Sstevel@tonic-gate 14977c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 14987c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 14997c478bd9Sstevel@tonic-gate 1500c39996a7Sstevel if (pfn > physmax) 1501c39996a7Sstevel return (LGRP_NULL_HANDLE); 1502c39996a7Sstevel 15037c478bd9Sstevel@tonic-gate mnode = plat_pfn_to_mem_node(pfn); 1504c39996a7Sstevel if (mnode < 0) 1505c39996a7Sstevel return (LGRP_NULL_HANDLE); 1506c39996a7Sstevel 15077c478bd9Sstevel@tonic-gate return (MEM_NODE_2_LGRPHAND(mnode)); 15087c478bd9Sstevel@tonic-gate } 15097c478bd9Sstevel@tonic-gate 15107c478bd9Sstevel@tonic-gate /* 15117c478bd9Sstevel@tonic-gate * Return the maximum number of lgrps supported by the platform. 15127c478bd9Sstevel@tonic-gate * Before lgrp topology is known it returns an estimate based on the number of 15137c478bd9Sstevel@tonic-gate * nodes. Once topology is known it returns the actual maximim number of lgrps 15147c478bd9Sstevel@tonic-gate * created. Since x86 doesn't support dynamic addition of new nodes, this number 15157c478bd9Sstevel@tonic-gate * may not grow during system lifetime. 15167c478bd9Sstevel@tonic-gate */ 15177c478bd9Sstevel@tonic-gate int 15187c478bd9Sstevel@tonic-gate lgrp_plat_max_lgrps() 15197c478bd9Sstevel@tonic-gate { 15207c478bd9Sstevel@tonic-gate return (lgrp_topo_initialized ? 
15217c478bd9Sstevel@tonic-gate lgrp_alloc_max + 1 : 15227c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 15237c478bd9Sstevel@tonic-gate } 15247c478bd9Sstevel@tonic-gate 15257c478bd9Sstevel@tonic-gate /* 15267c478bd9Sstevel@tonic-gate * Return the number of free, allocatable, or installed 15277c478bd9Sstevel@tonic-gate * pages in an lgroup 15287c478bd9Sstevel@tonic-gate * This is a copy of the MAX_MEM_NODES == 1 version of the routine 15297c478bd9Sstevel@tonic-gate * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 15307c478bd9Sstevel@tonic-gate */ 15317c478bd9Sstevel@tonic-gate /* ARGSUSED */ 15327c478bd9Sstevel@tonic-gate static pgcnt_t 15337c478bd9Sstevel@tonic-gate lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 15347c478bd9Sstevel@tonic-gate { 15357c478bd9Sstevel@tonic-gate struct memlist *mlist; 15367c478bd9Sstevel@tonic-gate pgcnt_t npgs = 0; 15377c478bd9Sstevel@tonic-gate extern struct memlist *phys_avail; 15387c478bd9Sstevel@tonic-gate extern struct memlist *phys_install; 15397c478bd9Sstevel@tonic-gate 15407c478bd9Sstevel@tonic-gate switch (query) { 15417c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_FREE: 15427c478bd9Sstevel@tonic-gate return ((pgcnt_t)freemem); 15437c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_AVAIL: 15447c478bd9Sstevel@tonic-gate memlist_read_lock(); 15457c478bd9Sstevel@tonic-gate for (mlist = phys_avail; mlist; mlist = mlist->next) 15467c478bd9Sstevel@tonic-gate npgs += btop(mlist->size); 15477c478bd9Sstevel@tonic-gate memlist_read_unlock(); 15487c478bd9Sstevel@tonic-gate return (npgs); 15497c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_INSTALL: 15507c478bd9Sstevel@tonic-gate memlist_read_lock(); 15517c478bd9Sstevel@tonic-gate for (mlist = phys_install; mlist; mlist = mlist->next) 15527c478bd9Sstevel@tonic-gate npgs += btop(mlist->size); 15537c478bd9Sstevel@tonic-gate memlist_read_unlock(); 15547c478bd9Sstevel@tonic-gate return (npgs); 
15557c478bd9Sstevel@tonic-gate default: 15567c478bd9Sstevel@tonic-gate return ((pgcnt_t)0); 15577c478bd9Sstevel@tonic-gate } 15587c478bd9Sstevel@tonic-gate } 15597c478bd9Sstevel@tonic-gate 15607c478bd9Sstevel@tonic-gate /* 15617c478bd9Sstevel@tonic-gate * Return the number of free pages in an lgroup. 15627c478bd9Sstevel@tonic-gate * 15637c478bd9Sstevel@tonic-gate * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 15647c478bd9Sstevel@tonic-gate * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 15657c478bd9Sstevel@tonic-gate * number of allocatable base pagesize pages corresponding to the 15667c478bd9Sstevel@tonic-gate * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 15677c478bd9Sstevel@tonic-gate * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 15687c478bd9Sstevel@tonic-gate * memory installed, regardless of whether or not it's usable. 15697c478bd9Sstevel@tonic-gate */ 15707c478bd9Sstevel@tonic-gate pgcnt_t 15717c478bd9Sstevel@tonic-gate lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 15727c478bd9Sstevel@tonic-gate { 15737c478bd9Sstevel@tonic-gate int mnode; 15747c478bd9Sstevel@tonic-gate pgcnt_t npgs = (pgcnt_t)0; 15757c478bd9Sstevel@tonic-gate extern struct memlist *phys_avail; 15767c478bd9Sstevel@tonic-gate extern struct memlist *phys_install; 15777c478bd9Sstevel@tonic-gate 15787c478bd9Sstevel@tonic-gate 15797c478bd9Sstevel@tonic-gate if (plathand == LGRP_DEFAULT_HANDLE) 15807c478bd9Sstevel@tonic-gate return (lgrp_plat_mem_size_default(plathand, query)); 15817c478bd9Sstevel@tonic-gate 15827c478bd9Sstevel@tonic-gate if (plathand != LGRP_NULL_HANDLE) { 15837c478bd9Sstevel@tonic-gate mnode = plat_lgrphand_to_mem_node(plathand); 15847c478bd9Sstevel@tonic-gate if (mnode >= 0 && mem_node_config[mnode].exists) { 15857c478bd9Sstevel@tonic-gate switch (query) { 15867c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_FREE: 1587affbd3ccSkchow npgs = MNODE_PGCNT(mnode); 
15887c478bd9Sstevel@tonic-gate break; 15897c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_AVAIL: 15907c478bd9Sstevel@tonic-gate npgs = mem_node_memlist_pages(mnode, 15917c478bd9Sstevel@tonic-gate phys_avail); 15927c478bd9Sstevel@tonic-gate break; 15937c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_INSTALL: 15947c478bd9Sstevel@tonic-gate npgs = mem_node_memlist_pages(mnode, 15957c478bd9Sstevel@tonic-gate phys_install); 15967c478bd9Sstevel@tonic-gate break; 15977c478bd9Sstevel@tonic-gate default: 15987c478bd9Sstevel@tonic-gate break; 15997c478bd9Sstevel@tonic-gate } 16007c478bd9Sstevel@tonic-gate } 16017c478bd9Sstevel@tonic-gate } 16027c478bd9Sstevel@tonic-gate return (npgs); 16037c478bd9Sstevel@tonic-gate } 16047c478bd9Sstevel@tonic-gate 16057c478bd9Sstevel@tonic-gate /* 16067c478bd9Sstevel@tonic-gate * Return latency between "from" and "to" lgroups 16077c478bd9Sstevel@tonic-gate * 16087c478bd9Sstevel@tonic-gate * This latency number can only be used for relative comparison 16097c478bd9Sstevel@tonic-gate * between lgroups on the running system, cannot be used across platforms, 16107c478bd9Sstevel@tonic-gate * and may not reflect the actual latency. It is platform and implementation 16117c478bd9Sstevel@tonic-gate * specific, so platform gets to decide its value. It would be nice if the 16127c478bd9Sstevel@tonic-gate * number was at least proportional to make comparisons more meaningful though. 
16137c478bd9Sstevel@tonic-gate */ 16147c478bd9Sstevel@tonic-gate /* ARGSUSED */ 16157c478bd9Sstevel@tonic-gate int 16167c478bd9Sstevel@tonic-gate lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 16177c478bd9Sstevel@tonic-gate { 16187c478bd9Sstevel@tonic-gate lgrp_handle_t src, dest; 16197c478bd9Sstevel@tonic-gate 16207c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 16217c478bd9Sstevel@tonic-gate return (0); 16227c478bd9Sstevel@tonic-gate 16237c478bd9Sstevel@tonic-gate /* 16247c478bd9Sstevel@tonic-gate * Return max latency for root lgroup 16257c478bd9Sstevel@tonic-gate */ 16267c478bd9Sstevel@tonic-gate if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 16277c478bd9Sstevel@tonic-gate return (lgrp_plat_probe_time_max); 16287c478bd9Sstevel@tonic-gate 16297c478bd9Sstevel@tonic-gate src = from; 16307c478bd9Sstevel@tonic-gate dest = to; 16317c478bd9Sstevel@tonic-gate 16327c478bd9Sstevel@tonic-gate /* 16337c478bd9Sstevel@tonic-gate * Return 0 for nodes (lgroup platform handles) out of range 16347c478bd9Sstevel@tonic-gate */ 16357c478bd9Sstevel@tonic-gate if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 16367c478bd9Sstevel@tonic-gate return (0); 16377c478bd9Sstevel@tonic-gate 16387c478bd9Sstevel@tonic-gate /* 16397c478bd9Sstevel@tonic-gate * Probe from current CPU if its lgroup latencies haven't been set yet 16407c478bd9Sstevel@tonic-gate * and we are trying to get latency from current CPU to some node 16417c478bd9Sstevel@tonic-gate */ 16427c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[src][src] == 0 && 16437c478bd9Sstevel@tonic-gate LGRP_PLAT_CPU_TO_NODE(CPU) == src) 16447c478bd9Sstevel@tonic-gate lgrp_plat_probe(); 16457c478bd9Sstevel@tonic-gate 16467c478bd9Sstevel@tonic-gate return (lgrp_plat_probe_times[src][dest]); 16477c478bd9Sstevel@tonic-gate } 16487c478bd9Sstevel@tonic-gate 16497c478bd9Sstevel@tonic-gate /* 16507c478bd9Sstevel@tonic-gate * Return platform handle for root lgroup 16517c478bd9Sstevel@tonic-gate */ 
16527c478bd9Sstevel@tonic-gate lgrp_handle_t 16537c478bd9Sstevel@tonic-gate lgrp_plat_root_hand(void) 16547c478bd9Sstevel@tonic-gate { 16557c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 16567c478bd9Sstevel@tonic-gate } 1657