17c478bd9Sstevel@tonic-gate /* 27c478bd9Sstevel@tonic-gate * CDDL HEADER START 37c478bd9Sstevel@tonic-gate * 47c478bd9Sstevel@tonic-gate * The contents of this file are subject to the terms of the 5c39996a7Sstevel * Common Development and Distribution License (the "License"). 6c39996a7Sstevel * You may not use this file except in compliance with the License. 77c478bd9Sstevel@tonic-gate * 87c478bd9Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 97c478bd9Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing. 107c478bd9Sstevel@tonic-gate * See the License for the specific language governing permissions 117c478bd9Sstevel@tonic-gate * and limitations under the License. 127c478bd9Sstevel@tonic-gate * 137c478bd9Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each 147c478bd9Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 157c478bd9Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the 167c478bd9Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying 177c478bd9Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner] 187c478bd9Sstevel@tonic-gate * 197c478bd9Sstevel@tonic-gate * CDDL HEADER END 207c478bd9Sstevel@tonic-gate */ 21c39996a7Sstevel 227c478bd9Sstevel@tonic-gate /* 23fb2f18f8Sesaxe * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 247c478bd9Sstevel@tonic-gate * Use is subject to license terms. 257c478bd9Sstevel@tonic-gate */ 267c478bd9Sstevel@tonic-gate 277c478bd9Sstevel@tonic-gate #pragma ident "%Z%%M% %I% %E% SMI" 287c478bd9Sstevel@tonic-gate 297c478bd9Sstevel@tonic-gate 307c478bd9Sstevel@tonic-gate #include <sys/archsystm.h> /* for {in,out}{b,w,l}() */ 317c478bd9Sstevel@tonic-gate #include <sys/cmn_err.h> 32*f78a91cdSjjc #include <sys/controlregs.h> 337c478bd9Sstevel@tonic-gate #include <sys/cpupart.h> 347c478bd9Sstevel@tonic-gate #include <sys/cpuvar.h> 357c478bd9Sstevel@tonic-gate #include <sys/lgrp.h> 367c478bd9Sstevel@tonic-gate #include <sys/machsystm.h> 377c478bd9Sstevel@tonic-gate #include <sys/memlist.h> 387c478bd9Sstevel@tonic-gate #include <sys/memnode.h> 397c478bd9Sstevel@tonic-gate #include <sys/mman.h> 40ef50d8c0Sesaxe #include <sys/pci_cfgspace.h> 41ef50d8c0Sesaxe #include <sys/pci_impl.h> 427c478bd9Sstevel@tonic-gate #include <sys/param.h> 43fb2f18f8Sesaxe #include <sys/pghw.h> 447c478bd9Sstevel@tonic-gate #include <sys/promif.h> /* for prom_printf() */ 457c478bd9Sstevel@tonic-gate #include <sys/systm.h> 467c478bd9Sstevel@tonic-gate #include <sys/thread.h> 477c478bd9Sstevel@tonic-gate #include <sys/types.h> 487c478bd9Sstevel@tonic-gate #include <sys/var.h> 497c478bd9Sstevel@tonic-gate #include <sys/x86_archext.h> /* for x86_feature and X86_AMD */ 507c478bd9Sstevel@tonic-gate #include <vm/hat_i86.h> 517c478bd9Sstevel@tonic-gate #include <vm/seg_kmem.h> 52affbd3ccSkchow #include <vm/vm_dep.h> 537c478bd9Sstevel@tonic-gate 547c478bd9Sstevel@tonic-gate 557c478bd9Sstevel@tonic-gate /* 567c478bd9Sstevel@tonic-gate * lgroup platform support for x86 platforms. 577c478bd9Sstevel@tonic-gate */ 587c478bd9Sstevel@tonic-gate 597c478bd9Sstevel@tonic-gate #define MAX_NODES 8 607c478bd9Sstevel@tonic-gate #define NLGRP (MAX_NODES * (MAX_NODES - 1) + 1) 617c478bd9Sstevel@tonic-gate 62fb2f18f8Sesaxe #define LGRP_PLAT_CPU_TO_NODE(cpu) (pg_plat_hw_instance_id(cpu, PGHW_CHIP)) 637c478bd9Sstevel@tonic-gate 647c478bd9Sstevel@tonic-gate #define LGRP_PLAT_PROBE_NROUNDS 64 /* default laps for probing */ 657c478bd9Sstevel@tonic-gate #define LGRP_PLAT_PROBE_NSAMPLES 1 /* default samples to take */ 668949bcd6Sandrei #define LGRP_PLAT_PROBE_NREADS 256 /* number of vendor ID reads */ 677c478bd9Sstevel@tonic-gate 687c478bd9Sstevel@tonic-gate /* 697c478bd9Sstevel@tonic-gate * Multiprocessor Opteron machines have Non Uniform Memory Access (NUMA). 707c478bd9Sstevel@tonic-gate * 71*f78a91cdSjjc * Until this code supports reading System Resource Affinity Table (SRAT), 727c478bd9Sstevel@tonic-gate * we need to examine registers in PCI configuration space to determine how 737c478bd9Sstevel@tonic-gate * many nodes are in the system and which CPUs and memory are in each node. 747c478bd9Sstevel@tonic-gate * This could be determined by probing all memory from each CPU, but that is 757c478bd9Sstevel@tonic-gate * too expensive to do while booting the kernel. 767c478bd9Sstevel@tonic-gate * 777c478bd9Sstevel@tonic-gate * NOTE: Using these PCI configuration space registers to determine this 78*f78a91cdSjjc * locality info is not guaranteed to work on future generations of 79*f78a91cdSjjc * Opteron processor. 807c478bd9Sstevel@tonic-gate */ 817c478bd9Sstevel@tonic-gate 827c478bd9Sstevel@tonic-gate /* 837c478bd9Sstevel@tonic-gate * Opteron DRAM Address Map in PCI configuration space gives base and limit 84*f78a91cdSjjc * of physical memory in each node. The following constants and macros define 85*f78a91cdSjjc * their contents, structure, and access. 867c478bd9Sstevel@tonic-gate */ 877c478bd9Sstevel@tonic-gate 887c478bd9Sstevel@tonic-gate /* 897c478bd9Sstevel@tonic-gate * How many bits to shift Opteron DRAM Address Map base and limit registers 907c478bd9Sstevel@tonic-gate * to get actual value 917c478bd9Sstevel@tonic-gate */ 92*f78a91cdSjjc #define OPT_DRAMADDR_HI_LSHIFT_ADDR 40 /* shift left for address */ 93*f78a91cdSjjc #define OPT_DRAMADDR_LO_LSHIFT_ADDR 8 /* shift left for address */ 947c478bd9Sstevel@tonic-gate 95*f78a91cdSjjc #define OPT_DRAMADDR_HI_MASK_ADDR 0x000000FF /* address bits 47-40 */ 96*f78a91cdSjjc #define OPT_DRAMADDR_LO_MASK_ADDR 0xFFFF0000 /* address bits 39-24 */ 97*f78a91cdSjjc 98*f78a91cdSjjc #define OPT_DRAMADDR_LO_MASK_OFF 0xFFFFFF /* offset for address */ 99*f78a91cdSjjc 100*f78a91cdSjjc /* 101*f78a91cdSjjc * Macros to derive addresses from Opteron DRAM Address Map registers 102*f78a91cdSjjc */ 103*f78a91cdSjjc #define OPT_DRAMADDR_HI(reg) \ 104*f78a91cdSjjc (((u_longlong_t)reg & OPT_DRAMADDR_HI_MASK_ADDR) << \ 105*f78a91cdSjjc OPT_DRAMADDR_HI_LSHIFT_ADDR) 106*f78a91cdSjjc 107*f78a91cdSjjc #define OPT_DRAMADDR_LO(reg) \ 108*f78a91cdSjjc (((u_longlong_t)reg & OPT_DRAMADDR_LO_MASK_ADDR) << \ 109*f78a91cdSjjc OPT_DRAMADDR_LO_LSHIFT_ADDR) 110*f78a91cdSjjc 111*f78a91cdSjjc #define OPT_DRAMADDR(high, low) \ 112*f78a91cdSjjc (OPT_DRAMADDR_HI(high) | OPT_DRAMADDR_LO(low)) 1137c478bd9Sstevel@tonic-gate 1147c478bd9Sstevel@tonic-gate /* 1157c478bd9Sstevel@tonic-gate * Bit masks defining what's in Opteron DRAM Address Map base register 1167c478bd9Sstevel@tonic-gate */ 117*f78a91cdSjjc #define OPT_DRAMBASE_LO_MASK_RE 0x1 /* read enable */ 118*f78a91cdSjjc #define OPT_DRAMBASE_LO_MASK_WE 0x2 /* write enable */ 119*f78a91cdSjjc #define OPT_DRAMBASE_LO_MASK_INTRLVEN 0x700 /* interleave */ 1207c478bd9Sstevel@tonic-gate 1217c478bd9Sstevel@tonic-gate /* 1227c478bd9Sstevel@tonic-gate * Bit masks defining what's in Opteron DRAM Address Map limit register 1237c478bd9Sstevel@tonic-gate */ 124*f78a91cdSjjc #define OPT_DRAMLIMIT_LO_MASK_DSTNODE 0x7 /* destination node */ 125*f78a91cdSjjc #define OPT_DRAMLIMIT_LO_MASK_INTRLVSEL 0x700 /* interleave select */ 1267c478bd9Sstevel@tonic-gate 1277c478bd9Sstevel@tonic-gate 1287c478bd9Sstevel@tonic-gate /* 1297c478bd9Sstevel@tonic-gate * Opteron Node ID register in PCI configuration space contains 1307c478bd9Sstevel@tonic-gate * number of nodes in system, etc. for Opteron K8. The following 1317c478bd9Sstevel@tonic-gate * constants and macros define its contents, structure, and access. 1327c478bd9Sstevel@tonic-gate */ 1337c478bd9Sstevel@tonic-gate 1347c478bd9Sstevel@tonic-gate /* 1357c478bd9Sstevel@tonic-gate * Bit masks defining what's in Opteron Node ID register 1367c478bd9Sstevel@tonic-gate */ 1377c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_ID 0x7 /* node ID */ 1387c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_CNT 0x70 /* node count */ 1397c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_IONODE 0x700 /* Hypertransport I/O hub node ID */ 1407c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_LCKNODE 0x7000 /* lock controller node ID */ 1417c478bd9Sstevel@tonic-gate #define OPT_NODE_MASK_CPUCNT 0xF0000 /* CPUs in system (0 means 1 CPU) */ 1427c478bd9Sstevel@tonic-gate 1437c478bd9Sstevel@tonic-gate /* 1447c478bd9Sstevel@tonic-gate * How many bits in Opteron Node ID register to shift right to get actual value 1457c478bd9Sstevel@tonic-gate */ 1467c478bd9Sstevel@tonic-gate #define OPT_NODE_RSHIFT_CNT 0x4 /* shift right for node count value */ 1477c478bd9Sstevel@tonic-gate 1487c478bd9Sstevel@tonic-gate /* 1497c478bd9Sstevel@tonic-gate * Macros to get values from Opteron Node ID register 1507c478bd9Sstevel@tonic-gate */ 1517c478bd9Sstevel@tonic-gate #define OPT_NODE_CNT(reg) \ 1527c478bd9Sstevel@tonic-gate ((reg & OPT_NODE_MASK_CNT) >> OPT_NODE_RSHIFT_CNT) 1537c478bd9Sstevel@tonic-gate 154*f78a91cdSjjc /* 155*f78a91cdSjjc * Macro to setup PCI Extended Configuration Space (ECS) address to give to 156*f78a91cdSjjc * "in/out" instructions 157*f78a91cdSjjc * 158*f78a91cdSjjc * NOTE: Should only be used in lgrp_plat_init() before MMIO setup because any 159*f78a91cdSjjc * other uses should just do MMIO to access PCI ECS. 160*f78a91cdSjjc * Must enable special bit in Northbridge Configuration Register on 161*f78a91cdSjjc * Greyhound for extended CF8 space access to be able to access PCI ECS 162*f78a91cdSjjc * using "in/out" instructions and restore special bit after done 163*f78a91cdSjjc * accessing PCI ECS. 164*f78a91cdSjjc */ 165*f78a91cdSjjc #define OPT_PCI_ECS_ADDR(bus, device, function, reg) \ 166*f78a91cdSjjc (PCI_CONE | (((bus) & 0xff) << 16) | (((device & 0x1f)) << 11) | \ 167*f78a91cdSjjc (((function) & 0x7) << 8) | ((reg) & 0xfc) | \ 168*f78a91cdSjjc ((((reg) >> 8) & 0xf) << 24)) 1697c478bd9Sstevel@tonic-gate 1707c478bd9Sstevel@tonic-gate /* 1717c478bd9Sstevel@tonic-gate * PCI configuration space registers accessed by specifying 1727c478bd9Sstevel@tonic-gate * a bus, device, function, and offset. The following constants 1737c478bd9Sstevel@tonic-gate * define the values needed to access Opteron K8 configuration 1747c478bd9Sstevel@tonic-gate * info to determine its node topology 1757c478bd9Sstevel@tonic-gate */ 1767c478bd9Sstevel@tonic-gate 1777c478bd9Sstevel@tonic-gate #define OPT_PCS_BUS_CONFIG 0 /* Hypertransport config space bus */ 1787c478bd9Sstevel@tonic-gate 1797c478bd9Sstevel@tonic-gate /* 1807c478bd9Sstevel@tonic-gate * Opteron PCI configuration space register function values 1817c478bd9Sstevel@tonic-gate */ 1827c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_HT 0 /* Hypertransport configuration */ 1837c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_ADDRMAP 1 /* Address map configuration */ 1847c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_DRAM 2 /* DRAM configuration */ 1857c478bd9Sstevel@tonic-gate #define OPT_PCS_FUNC_MISC 3 /* Miscellaneous configuration */ 1867c478bd9Sstevel@tonic-gate 1877c478bd9Sstevel@tonic-gate /* 1887c478bd9Sstevel@tonic-gate * PCI Configuration Space register offsets 1897c478bd9Sstevel@tonic-gate */ 1907c478bd9Sstevel@tonic-gate #define OPT_PCS_OFF_VENDOR 0x0 /* device/vendor ID register */ 191*f78a91cdSjjc #define OPT_PCS_OFF_DRAMBASE_HI 0x140 /* DRAM Base register (node 0) */ 192*f78a91cdSjjc #define OPT_PCS_OFF_DRAMBASE_LO 0x40 /* DRAM Base register (node 0) */ 1937c478bd9Sstevel@tonic-gate #define OPT_PCS_OFF_NODEID 0x60 /* Node ID register */ 1947c478bd9Sstevel@tonic-gate 1957c478bd9Sstevel@tonic-gate /* 1967c478bd9Sstevel@tonic-gate * Opteron PCI Configuration Space device IDs for nodes 1977c478bd9Sstevel@tonic-gate */ 1987c478bd9Sstevel@tonic-gate #define OPT_PCS_DEV_NODE0 24 /* device number for node 0 */ 1997c478bd9Sstevel@tonic-gate 2007c478bd9Sstevel@tonic-gate 2017c478bd9Sstevel@tonic-gate /* 2027c478bd9Sstevel@tonic-gate * Bookkeeping for latencies seen during probing (used for verification) 2037c478bd9Sstevel@tonic-gate */ 2047c478bd9Sstevel@tonic-gate typedef struct lgrp_plat_latency_acct { 2057c478bd9Sstevel@tonic-gate hrtime_t la_value; /* latency value */ 2067c478bd9Sstevel@tonic-gate int la_count; /* occurrences */ 2077c478bd9Sstevel@tonic-gate } lgrp_plat_latency_acct_t; 2087c478bd9Sstevel@tonic-gate 2097c478bd9Sstevel@tonic-gate 2107c478bd9Sstevel@tonic-gate /* 2117c478bd9Sstevel@tonic-gate * Choices for probing to determine lgroup topology 2127c478bd9Sstevel@tonic-gate */ 2137c478bd9Sstevel@tonic-gate typedef enum lgrp_plat_probe_op { 2147c478bd9Sstevel@tonic-gate LGRP_PLAT_PROBE_PGCPY, /* Use page copy */ 2157c478bd9Sstevel@tonic-gate LGRP_PLAT_PROBE_VENDOR /* Read vendor ID on Northbridge */ 2167c478bd9Sstevel@tonic-gate } lgrp_plat_probe_op_t; 2177c478bd9Sstevel@tonic-gate 2187c478bd9Sstevel@tonic-gate 2197c478bd9Sstevel@tonic-gate /* 2207c478bd9Sstevel@tonic-gate * Opteron DRAM address map gives base and limit for physical memory in a node 2217c478bd9Sstevel@tonic-gate */ 2227c478bd9Sstevel@tonic-gate typedef struct opt_dram_addr_map { 223*f78a91cdSjjc uint32_t base_hi; 224*f78a91cdSjjc uint32_t base_lo; 225*f78a91cdSjjc uint32_t limit_hi; 226*f78a91cdSjjc uint32_t limit_lo; 2277c478bd9Sstevel@tonic-gate } opt_dram_addr_map_t; 2287c478bd9Sstevel@tonic-gate 2297c478bd9Sstevel@tonic-gate 2307c478bd9Sstevel@tonic-gate /* 2317c478bd9Sstevel@tonic-gate * Starting and ending page for physical memory in node 2327c478bd9Sstevel@tonic-gate */ 2337c478bd9Sstevel@tonic-gate typedef struct phys_addr_map { 2347c478bd9Sstevel@tonic-gate pfn_t start; 2357c478bd9Sstevel@tonic-gate pfn_t end; 236a940d195Sjjc int exists; 2377c478bd9Sstevel@tonic-gate } phys_addr_map_t; 2387c478bd9Sstevel@tonic-gate 2397c478bd9Sstevel@tonic-gate 2407c478bd9Sstevel@tonic-gate /* 2417c478bd9Sstevel@tonic-gate * Opteron DRAM address map for each node 2427c478bd9Sstevel@tonic-gate */ 2437c478bd9Sstevel@tonic-gate struct opt_dram_addr_map opt_dram_map[MAX_NODES]; 2447c478bd9Sstevel@tonic-gate 2457c478bd9Sstevel@tonic-gate /* 2467c478bd9Sstevel@tonic-gate * Node ID register contents for each node 2477c478bd9Sstevel@tonic-gate */ 2487c478bd9Sstevel@tonic-gate uint_t opt_node_info[MAX_NODES]; 2497c478bd9Sstevel@tonic-gate 2507c478bd9Sstevel@tonic-gate /* 2517c478bd9Sstevel@tonic-gate * Whether memory is interleaved across nodes causing MPO to be disabled 2527c478bd9Sstevel@tonic-gate */ 2537c478bd9Sstevel@tonic-gate int lgrp_plat_mem_intrlv = 0; 2547c478bd9Sstevel@tonic-gate 2557c478bd9Sstevel@tonic-gate /* 2567c478bd9Sstevel@tonic-gate * Number of nodes in system 2577c478bd9Sstevel@tonic-gate */ 2587c478bd9Sstevel@tonic-gate uint_t lgrp_plat_node_cnt = 1; 2597c478bd9Sstevel@tonic-gate 2607c478bd9Sstevel@tonic-gate /* 2617c478bd9Sstevel@tonic-gate * Physical address range for memory in each node 2627c478bd9Sstevel@tonic-gate */ 2637c478bd9Sstevel@tonic-gate phys_addr_map_t lgrp_plat_node_memory[MAX_NODES]; 2647c478bd9Sstevel@tonic-gate 2657c478bd9Sstevel@tonic-gate /* 2667c478bd9Sstevel@tonic-gate * Probe costs (individual and total) and flush cost 2677c478bd9Sstevel@tonic-gate */ 2687c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_flush_cost = 0; 2697c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_cost = 0; 2707c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_cost_total = 0; 2717c478bd9Sstevel@tonic-gate 2727c478bd9Sstevel@tonic-gate /* 2737c478bd9Sstevel@tonic-gate * Error code for latency adjustment and verification 2747c478bd9Sstevel@tonic-gate */ 2757c478bd9Sstevel@tonic-gate int lgrp_plat_probe_error_code = 0; 2767c478bd9Sstevel@tonic-gate 2777c478bd9Sstevel@tonic-gate /* 2787c478bd9Sstevel@tonic-gate * How much latencies were off from minimum values gotten 2797c478bd9Sstevel@tonic-gate */ 2807c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_errors[MAX_NODES][MAX_NODES]; 2817c478bd9Sstevel@tonic-gate 2827c478bd9Sstevel@tonic-gate /* 2837c478bd9Sstevel@tonic-gate * Unique probe latencies and number of occurrences of each 2847c478bd9Sstevel@tonic-gate */ 2857c478bd9Sstevel@tonic-gate lgrp_plat_latency_acct_t lgrp_plat_probe_lat_acct[MAX_NODES]; 2867c478bd9Sstevel@tonic-gate 2877c478bd9Sstevel@tonic-gate /* 2887c478bd9Sstevel@tonic-gate * Size of memory buffer in each node for probing 2897c478bd9Sstevel@tonic-gate */ 2907c478bd9Sstevel@tonic-gate size_t lgrp_plat_probe_memsize = 0; 2917c478bd9Sstevel@tonic-gate 2927c478bd9Sstevel@tonic-gate /* 2937c478bd9Sstevel@tonic-gate * Virtual address of page in each node for probing 2947c478bd9Sstevel@tonic-gate */ 2957c478bd9Sstevel@tonic-gate caddr_t lgrp_plat_probe_memory[MAX_NODES]; 2967c478bd9Sstevel@tonic-gate 2977c478bd9Sstevel@tonic-gate /* 2987c478bd9Sstevel@tonic-gate * Number of unique latencies in probe times 2997c478bd9Sstevel@tonic-gate */ 3007c478bd9Sstevel@tonic-gate int lgrp_plat_probe_nlatencies = 0; 3017c478bd9Sstevel@tonic-gate 3027c478bd9Sstevel@tonic-gate /* 3037c478bd9Sstevel@tonic-gate * How many rounds of probing to do 3047c478bd9Sstevel@tonic-gate */ 3057c478bd9Sstevel@tonic-gate int lgrp_plat_probe_nrounds = LGRP_PLAT_PROBE_NROUNDS; 3067c478bd9Sstevel@tonic-gate 3077c478bd9Sstevel@tonic-gate /* 3087c478bd9Sstevel@tonic-gate * Number of samples to take when probing each node 3097c478bd9Sstevel@tonic-gate */ 3107c478bd9Sstevel@tonic-gate int lgrp_plat_probe_nsamples = LGRP_PLAT_PROBE_NSAMPLES; 3117c478bd9Sstevel@tonic-gate 3127c478bd9Sstevel@tonic-gate /* 3138949bcd6Sandrei * Number of times to read vendor ID from Northbridge for each probe. 3148949bcd6Sandrei */ 3158949bcd6Sandrei int lgrp_plat_probe_nreads = LGRP_PLAT_PROBE_NREADS; 3168949bcd6Sandrei 3178949bcd6Sandrei /* 3187c478bd9Sstevel@tonic-gate * How to probe to determine lgroup topology 3197c478bd9Sstevel@tonic-gate */ 3207c478bd9Sstevel@tonic-gate lgrp_plat_probe_op_t lgrp_plat_probe_op = LGRP_PLAT_PROBE_VENDOR; 3217c478bd9Sstevel@tonic-gate 3227c478bd9Sstevel@tonic-gate /* 3237c478bd9Sstevel@tonic-gate * PFN of page in each node for probing 3247c478bd9Sstevel@tonic-gate */ 3257c478bd9Sstevel@tonic-gate pfn_t lgrp_plat_probe_pfn[MAX_NODES]; 3267c478bd9Sstevel@tonic-gate 3277c478bd9Sstevel@tonic-gate /* 3287c478bd9Sstevel@tonic-gate * Whether probe time was suspect (ie. not within tolerance of value that it 3297c478bd9Sstevel@tonic-gate * should match) 3307c478bd9Sstevel@tonic-gate */ 3317c478bd9Sstevel@tonic-gate int lgrp_plat_probe_suspect[MAX_NODES][MAX_NODES]; 3327c478bd9Sstevel@tonic-gate 3337c478bd9Sstevel@tonic-gate /* 3347c478bd9Sstevel@tonic-gate * How long it takes to access memory from each node 3357c478bd9Sstevel@tonic-gate */ 3367c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_times[MAX_NODES][MAX_NODES]; 3377c478bd9Sstevel@tonic-gate 3387c478bd9Sstevel@tonic-gate /* 3397c478bd9Sstevel@tonic-gate * Min and max node memory probe times seen 3407c478bd9Sstevel@tonic-gate */ 3417c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_time_max = 0; 3427c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_time_min = -1; 3437c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_max[MAX_NODES][MAX_NODES]; 3447c478bd9Sstevel@tonic-gate hrtime_t lgrp_plat_probe_min[MAX_NODES][MAX_NODES]; 3457c478bd9Sstevel@tonic-gate 3467c478bd9Sstevel@tonic-gate 3477c478bd9Sstevel@tonic-gate /* 3487c478bd9Sstevel@tonic-gate * Allocate lgrp and lgrp stat arrays statically. 3497c478bd9Sstevel@tonic-gate */ 3507c478bd9Sstevel@tonic-gate static lgrp_t lgrp_space[NLGRP]; 3517c478bd9Sstevel@tonic-gate static int nlgrps_alloc; 3527c478bd9Sstevel@tonic-gate 3537c478bd9Sstevel@tonic-gate struct lgrp_stats lgrp_stats[NLGRP]; 3547c478bd9Sstevel@tonic-gate 355*f78a91cdSjjc /* 356*f78a91cdSjjc * Supported AMD processor families 357*f78a91cdSjjc */ 358*f78a91cdSjjc #define AMD_FAMILY_HAMMER 15 359*f78a91cdSjjc #define AMD_FAMILY_GREYHOUND 16 3607c478bd9Sstevel@tonic-gate 361*f78a91cdSjjc /* 362*f78a91cdSjjc * Whether to have is_opteron() return 1 even when processor isn't 363*f78a91cdSjjc * supported 364*f78a91cdSjjc */ 365*f78a91cdSjjc uint_t is_opteron_override = 0; 366*f78a91cdSjjc 367*f78a91cdSjjc /* 368*f78a91cdSjjc * AMD processor family for current CPU 369*f78a91cdSjjc */ 3707c478bd9Sstevel@tonic-gate uint_t opt_family = 0; 371*f78a91cdSjjc 3727c478bd9Sstevel@tonic-gate uint_t opt_probe_func = OPT_PCS_FUNC_DRAM; 3737c478bd9Sstevel@tonic-gate 3747c478bd9Sstevel@tonic-gate 3757c478bd9Sstevel@tonic-gate /* 376*f78a91cdSjjc * Determine whether we're running on a supported AMD Opteron since reading 377*f78a91cdSjjc * node count and DRAM address map registers may have different format or 378*f78a91cdSjjc * may not be supported in future processor families 3797c478bd9Sstevel@tonic-gate */ 3807c478bd9Sstevel@tonic-gate int 3817c478bd9Sstevel@tonic-gate is_opteron(void) 3827c478bd9Sstevel@tonic-gate { 383*f78a91cdSjjc 3847c478bd9Sstevel@tonic-gate if (x86_vendor != X86_VENDOR_AMD) 3857c478bd9Sstevel@tonic-gate return (0); 3867c478bd9Sstevel@tonic-gate 387*f78a91cdSjjc opt_family = cpuid_getfamily(CPU); 388*f78a91cdSjjc if (opt_family == AMD_FAMILY_HAMMER || 389*f78a91cdSjjc opt_family == AMD_FAMILY_GREYHOUND || is_opteron_override) 3907c478bd9Sstevel@tonic-gate return (1); 3917c478bd9Sstevel@tonic-gate else 3927c478bd9Sstevel@tonic-gate return (0); 3937c478bd9Sstevel@tonic-gate } 3947c478bd9Sstevel@tonic-gate 3957c478bd9Sstevel@tonic-gate int 3967c478bd9Sstevel@tonic-gate plat_lgrphand_to_mem_node(lgrp_handle_t hand) 3977c478bd9Sstevel@tonic-gate { 3987c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 3997c478bd9Sstevel@tonic-gate return (0); 4007c478bd9Sstevel@tonic-gate 4017c478bd9Sstevel@tonic-gate return ((int)hand); 4027c478bd9Sstevel@tonic-gate } 4037c478bd9Sstevel@tonic-gate 4047c478bd9Sstevel@tonic-gate lgrp_handle_t 4057c478bd9Sstevel@tonic-gate plat_mem_node_to_lgrphand(int mnode) 4067c478bd9Sstevel@tonic-gate { 4077c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 4087c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 4097c478bd9Sstevel@tonic-gate 4107c478bd9Sstevel@tonic-gate return ((lgrp_handle_t)mnode); 4117c478bd9Sstevel@tonic-gate } 4127c478bd9Sstevel@tonic-gate 4137c478bd9Sstevel@tonic-gate int 4147c478bd9Sstevel@tonic-gate plat_pfn_to_mem_node(pfn_t pfn) 4157c478bd9Sstevel@tonic-gate { 4167c478bd9Sstevel@tonic-gate int node; 4177c478bd9Sstevel@tonic-gate 4187c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 4197c478bd9Sstevel@tonic-gate return (0); 4207c478bd9Sstevel@tonic-gate 4217c478bd9Sstevel@tonic-gate for (node = 0; node < lgrp_plat_node_cnt; node++) { 422a940d195Sjjc /* 423a940d195Sjjc * Skip nodes with no memory 424a940d195Sjjc */ 425a940d195Sjjc if (!lgrp_plat_node_memory[node].exists) 426a940d195Sjjc continue; 427a940d195Sjjc 4287c478bd9Sstevel@tonic-gate if (pfn >= lgrp_plat_node_memory[node].start && 4297c478bd9Sstevel@tonic-gate pfn <= lgrp_plat_node_memory[node].end) 4307c478bd9Sstevel@tonic-gate return (node); 4317c478bd9Sstevel@tonic-gate } 4327c478bd9Sstevel@tonic-gate 4337c478bd9Sstevel@tonic-gate ASSERT(node < lgrp_plat_node_cnt); 4347c478bd9Sstevel@tonic-gate return (-1); 4357c478bd9Sstevel@tonic-gate } 4367c478bd9Sstevel@tonic-gate 4377c478bd9Sstevel@tonic-gate /* 4387c478bd9Sstevel@tonic-gate * Configure memory nodes for machines with more than one node (ie NUMA) 4397c478bd9Sstevel@tonic-gate */ 4407c478bd9Sstevel@tonic-gate void 4417c478bd9Sstevel@tonic-gate plat_build_mem_nodes(struct memlist *list) 4427c478bd9Sstevel@tonic-gate { 443a940d195Sjjc pfn_t cur_start; /* start addr of subrange */ 444a940d195Sjjc pfn_t cur_end; /* end addr of subrange */ 445a940d195Sjjc pfn_t start; /* start addr of whole range */ 446a940d195Sjjc pfn_t end; /* end addr of whole range */ 4477c478bd9Sstevel@tonic-gate 4487c478bd9Sstevel@tonic-gate /* 4497c478bd9Sstevel@tonic-gate * Boot install lists are arranged <addr, len>, ... 4507c478bd9Sstevel@tonic-gate */ 4517c478bd9Sstevel@tonic-gate while (list) { 4527c478bd9Sstevel@tonic-gate int node; 4537c478bd9Sstevel@tonic-gate 4547c478bd9Sstevel@tonic-gate start = list->address >> PAGESHIFT; 4557c478bd9Sstevel@tonic-gate end = (list->address + list->size - 1) >> PAGESHIFT; 4567c478bd9Sstevel@tonic-gate 4577c478bd9Sstevel@tonic-gate if (start > physmax) { 4587c478bd9Sstevel@tonic-gate list = list->next; 4597c478bd9Sstevel@tonic-gate continue; 4607c478bd9Sstevel@tonic-gate } 4617c478bd9Sstevel@tonic-gate if (end > physmax) 4627c478bd9Sstevel@tonic-gate end = physmax; 4637c478bd9Sstevel@tonic-gate 4647c478bd9Sstevel@tonic-gate /* 4657c478bd9Sstevel@tonic-gate * When there is only one memnode, just add memory to memnode 4667c478bd9Sstevel@tonic-gate */ 4677c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) { 4687c478bd9Sstevel@tonic-gate mem_node_add_slice(start, end); 4697c478bd9Sstevel@tonic-gate list = list->next; 4707c478bd9Sstevel@tonic-gate continue; 4717c478bd9Sstevel@tonic-gate } 4727c478bd9Sstevel@tonic-gate 4737c478bd9Sstevel@tonic-gate /* 4747c478bd9Sstevel@tonic-gate * mem_node_add_slice() expects to get a memory range that 4757c478bd9Sstevel@tonic-gate * is within one memnode, so need to split any memory range 4767c478bd9Sstevel@tonic-gate * that spans multiple memnodes into subranges that are each 4777c478bd9Sstevel@tonic-gate * contained within one memnode when feeding them to 4787c478bd9Sstevel@tonic-gate * mem_node_add_slice() 4797c478bd9Sstevel@tonic-gate */ 4807c478bd9Sstevel@tonic-gate cur_start = start; 4817c478bd9Sstevel@tonic-gate do { 4827c478bd9Sstevel@tonic-gate node = plat_pfn_to_mem_node(cur_start); 4837c478bd9Sstevel@tonic-gate 484a940d195Sjjc /* 485a940d195Sjjc * Panic if DRAM address map registers or SRAT say 486a940d195Sjjc * memory in node doesn't exist or address from 487a940d195Sjjc * boot installed memory list entry isn't in this node. 488a940d195Sjjc * This shouldn't happen and rest of code can't deal 489a940d195Sjjc * with this if it does. 490a940d195Sjjc */ 491a940d195Sjjc if (node < 0 || node >= lgrp_plat_node_cnt || 492a940d195Sjjc !lgrp_plat_node_memory[node].exists || 493a940d195Sjjc cur_start < lgrp_plat_node_memory[node].start || 494a940d195Sjjc cur_start > lgrp_plat_node_memory[node].end) { 495a940d195Sjjc cmn_err(CE_PANIC, "Don't know which memnode " 496a940d195Sjjc "to add installed memory address 0x%lx\n", 497a940d195Sjjc cur_start); 498a940d195Sjjc } 4997c478bd9Sstevel@tonic-gate 5007c478bd9Sstevel@tonic-gate /* 5017c478bd9Sstevel@tonic-gate * End of current subrange should not span memnodes 5027c478bd9Sstevel@tonic-gate */ 503a940d195Sjjc cur_end = end; 504a940d195Sjjc if (lgrp_plat_node_memory[node].exists && 505a940d195Sjjc cur_end > lgrp_plat_node_memory[node].end) 5067c478bd9Sstevel@tonic-gate cur_end = lgrp_plat_node_memory[node].end; 5077c478bd9Sstevel@tonic-gate 5087c478bd9Sstevel@tonic-gate mem_node_add_slice(cur_start, cur_end); 5097c478bd9Sstevel@tonic-gate 5107c478bd9Sstevel@tonic-gate /* 5117c478bd9Sstevel@tonic-gate * Next subrange starts after end of current one 5127c478bd9Sstevel@tonic-gate */ 5137c478bd9Sstevel@tonic-gate cur_start = cur_end + 1; 5147c478bd9Sstevel@tonic-gate } while (cur_end < end); 5157c478bd9Sstevel@tonic-gate 5167c478bd9Sstevel@tonic-gate list = list->next; 5177c478bd9Sstevel@tonic-gate } 5187c478bd9Sstevel@tonic-gate mem_node_physalign = 0; 5197c478bd9Sstevel@tonic-gate mem_node_pfn_shift = 0; 5207c478bd9Sstevel@tonic-gate } 5217c478bd9Sstevel@tonic-gate 5227c478bd9Sstevel@tonic-gate 5237c478bd9Sstevel@tonic-gate /* 5247c478bd9Sstevel@tonic-gate * Platform-specific initialization of lgroups 5257c478bd9Sstevel@tonic-gate */ 5267c478bd9Sstevel@tonic-gate void 5277c478bd9Sstevel@tonic-gate lgrp_plat_init(void) 5287c478bd9Sstevel@tonic-gate { 5297c478bd9Sstevel@tonic-gate uint_t bus; 5307c478bd9Sstevel@tonic-gate uint_t dev; 5317c478bd9Sstevel@tonic-gate uint_t node; 532*f78a91cdSjjc uint_t off_hi; 533*f78a91cdSjjc uint_t off_lo; 534*f78a91cdSjjc uint64_t nb_cfg_reg; 5357c478bd9Sstevel@tonic-gate 5367c478bd9Sstevel@tonic-gate extern lgrp_load_t lgrp_expand_proc_thresh; 5377c478bd9Sstevel@tonic-gate extern lgrp_load_t lgrp_expand_proc_diff; 5387c478bd9Sstevel@tonic-gate 5397c478bd9Sstevel@tonic-gate /* 5407c478bd9Sstevel@tonic-gate * Initialize as a UMA machine if this isn't an Opteron 5417c478bd9Sstevel@tonic-gate */ 5427c478bd9Sstevel@tonic-gate if (!is_opteron() || lgrp_topo_ht_limit() == 1) { 5437c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt = max_mem_nodes = 1; 5447c478bd9Sstevel@tonic-gate return; 5457c478bd9Sstevel@tonic-gate } 5467c478bd9Sstevel@tonic-gate 5477c478bd9Sstevel@tonic-gate /* 5487c478bd9Sstevel@tonic-gate * Read configuration registers from PCI configuration space to 5497c478bd9Sstevel@tonic-gate * determine node information, which memory is in each node, etc. 5507c478bd9Sstevel@tonic-gate * 5517c478bd9Sstevel@tonic-gate * Write to PCI configuration space address register to specify 5527c478bd9Sstevel@tonic-gate * which configuration register to read and read/write PCI 5537c478bd9Sstevel@tonic-gate * configuration space data register to get/set contents 5547c478bd9Sstevel@tonic-gate */ 5557c478bd9Sstevel@tonic-gate bus = OPT_PCS_BUS_CONFIG; 5567c478bd9Sstevel@tonic-gate dev = OPT_PCS_DEV_NODE0; 557*f78a91cdSjjc off_hi = OPT_PCS_OFF_DRAMBASE_HI; 558*f78a91cdSjjc off_lo = OPT_PCS_OFF_DRAMBASE_LO; 5597c478bd9Sstevel@tonic-gate 5607c478bd9Sstevel@tonic-gate /* 5617c478bd9Sstevel@tonic-gate * Read node ID register for node 0 to get node count 5627c478bd9Sstevel@tonic-gate */ 563ef50d8c0Sesaxe opt_node_info[0] = pci_getl_func(bus, dev, OPT_PCS_FUNC_HT, 564ef50d8c0Sesaxe OPT_PCS_OFF_NODEID); 5657c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt = OPT_NODE_CNT(opt_node_info[0]) + 1; 5667c478bd9Sstevel@tonic-gate 567*f78a91cdSjjc /* 568*f78a91cdSjjc * For Greyhound, PCI Extended Configuration Space must be enabled to 569*f78a91cdSjjc * read high DRAM address map base and limit registers 570*f78a91cdSjjc */ 571*f78a91cdSjjc if (opt_family == AMD_FAMILY_GREYHOUND) { 572*f78a91cdSjjc nb_cfg_reg = rdmsr(MSR_AMD_NB_CFG); 573*f78a91cdSjjc if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 574*f78a91cdSjjc wrmsr(MSR_AMD_NB_CFG, 575*f78a91cdSjjc nb_cfg_reg | AMD_GH_NB_CFG_EN_ECS); 576*f78a91cdSjjc } 577*f78a91cdSjjc 5787c478bd9Sstevel@tonic-gate for (node = 0; node < lgrp_plat_node_cnt; node++) { 579*f78a91cdSjjc uint32_t base_hi; 580*f78a91cdSjjc uint32_t base_lo; 581*f78a91cdSjjc uint32_t limit_hi; 582*f78a91cdSjjc uint32_t limit_lo; 583*f78a91cdSjjc 5847c478bd9Sstevel@tonic-gate /* 5857c478bd9Sstevel@tonic-gate * Read node ID register (except for node 0 which we just read) 5867c478bd9Sstevel@tonic-gate */ 5877c478bd9Sstevel@tonic-gate if (node > 0) { 588ef50d8c0Sesaxe opt_node_info[node] = pci_getl_func(bus, dev, 589ef50d8c0Sesaxe OPT_PCS_FUNC_HT, OPT_PCS_OFF_NODEID); 5907c478bd9Sstevel@tonic-gate } 5917c478bd9Sstevel@tonic-gate 5927c478bd9Sstevel@tonic-gate /* 5937c478bd9Sstevel@tonic-gate * Read DRAM base and limit registers which specify 5947c478bd9Sstevel@tonic-gate * physical memory range of each node 5957c478bd9Sstevel@tonic-gate */ 596*f78a91cdSjjc if (opt_family != AMD_FAMILY_GREYHOUND) 597*f78a91cdSjjc base_hi = 0; 598*f78a91cdSjjc else { 599*f78a91cdSjjc outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 600*f78a91cdSjjc OPT_PCS_FUNC_ADDRMAP, off_hi)); 601*f78a91cdSjjc base_hi = opt_dram_map[node].base_hi = 602*f78a91cdSjjc inl(PCI_CONFDATA); 603*f78a91cdSjjc } 604*f78a91cdSjjc base_lo = opt_dram_map[node].base_lo = pci_getl_func(bus, dev, 605*f78a91cdSjjc OPT_PCS_FUNC_ADDRMAP, off_lo); 606*f78a91cdSjjc 607*f78a91cdSjjc if (opt_dram_map[node].base_lo & OPT_DRAMBASE_LO_MASK_INTRLVEN) 6087c478bd9Sstevel@tonic-gate lgrp_plat_mem_intrlv++; 6097c478bd9Sstevel@tonic-gate 610*f78a91cdSjjc off_hi += 4; /* high limit register offset */ 611*f78a91cdSjjc if (opt_family != AMD_FAMILY_GREYHOUND) 612*f78a91cdSjjc limit_hi = 0; 613*f78a91cdSjjc else { 614*f78a91cdSjjc outl(PCI_CONFADD, OPT_PCI_ECS_ADDR(bus, dev, 615*f78a91cdSjjc OPT_PCS_FUNC_ADDRMAP, off_hi)); 616*f78a91cdSjjc limit_hi = opt_dram_map[node].limit_hi = 617*f78a91cdSjjc inl(PCI_CONFDATA); 618*f78a91cdSjjc } 619*f78a91cdSjjc 620*f78a91cdSjjc off_lo += 4; /* low limit register offset */ 621*f78a91cdSjjc limit_lo = opt_dram_map[node].limit_lo = pci_getl_func(bus, 622*f78a91cdSjjc dev, OPT_PCS_FUNC_ADDRMAP, off_lo); 6237c478bd9Sstevel@tonic-gate 6247c478bd9Sstevel@tonic-gate /* 625*f78a91cdSjjc * Increment device number to next node and register offsets 626*f78a91cdSjjc * for DRAM base register of next node 6277c478bd9Sstevel@tonic-gate */ 628*f78a91cdSjjc off_hi += 4; 629*f78a91cdSjjc off_lo += 4; 6307c478bd9Sstevel@tonic-gate dev++; 6317c478bd9Sstevel@tonic-gate 6327c478bd9Sstevel@tonic-gate /* 633a940d195Sjjc * Both read and write enable bits must be enabled in DRAM 634a940d195Sjjc * address map base register for physical memory to exist in 635a940d195Sjjc * node 636a940d195Sjjc */ 637*f78a91cdSjjc if ((base_lo & OPT_DRAMBASE_LO_MASK_RE) == 0 || 638*f78a91cdSjjc (base_lo & OPT_DRAMBASE_LO_MASK_WE) == 0) { 639a940d195Sjjc /* 640a940d195Sjjc * Mark node memory as non-existent and set start and 641a940d195Sjjc * end addresses to be same in lgrp_plat_node_memory[] 642a940d195Sjjc */ 643a940d195Sjjc lgrp_plat_node_memory[node].exists = 0; 644a940d195Sjjc lgrp_plat_node_memory[node].start = 645a940d195Sjjc lgrp_plat_node_memory[node].end = (pfn_t)-1; 646a940d195Sjjc continue; 647a940d195Sjjc } 648a940d195Sjjc 649a940d195Sjjc /* 6507c478bd9Sstevel@tonic-gate * Get PFN for first page in each node, 6517c478bd9Sstevel@tonic-gate * so we can probe memory to determine latency topology 6527c478bd9Sstevel@tonic-gate */ 6537c478bd9Sstevel@tonic-gate lgrp_plat_probe_pfn[node] = 654*f78a91cdSjjc btop(OPT_DRAMADDR(base_hi, base_lo)); 6557c478bd9Sstevel@tonic-gate 6567c478bd9Sstevel@tonic-gate /* 657a940d195Sjjc * Mark node memory as existing and remember physical address 658a940d195Sjjc * range of each node for use later 6597c478bd9Sstevel@tonic-gate */ 660a940d195Sjjc lgrp_plat_node_memory[node].exists = 1; 661*f78a91cdSjjc 6627c478bd9Sstevel@tonic-gate lgrp_plat_node_memory[node].start = 663*f78a91cdSjjc btop(OPT_DRAMADDR(base_hi, base_lo)); 664*f78a91cdSjjc 6657c478bd9Sstevel@tonic-gate lgrp_plat_node_memory[node].end = 666*f78a91cdSjjc btop(OPT_DRAMADDR(limit_hi, limit_lo) | 667*f78a91cdSjjc OPT_DRAMADDR_LO_MASK_OFF); 668*f78a91cdSjjc } 669*f78a91cdSjjc 670*f78a91cdSjjc /* 671*f78a91cdSjjc * Restore PCI Extended Configuration Space enable bit 672*f78a91cdSjjc */ 673*f78a91cdSjjc if (opt_family == AMD_FAMILY_GREYHOUND) { 674*f78a91cdSjjc if ((nb_cfg_reg & AMD_GH_NB_CFG_EN_ECS) == 0) 675*f78a91cdSjjc wrmsr(MSR_AMD_NB_CFG, nb_cfg_reg); 6767c478bd9Sstevel@tonic-gate } 6777c478bd9Sstevel@tonic-gate 6787c478bd9Sstevel@tonic-gate /* 6797c478bd9Sstevel@tonic-gate * Only use one memory node if memory is interleaved between any nodes 6807c478bd9Sstevel@tonic-gate */ 6817c478bd9Sstevel@tonic-gate if (lgrp_plat_mem_intrlv) { 6827c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt = max_mem_nodes = 1; 6837c478bd9Sstevel@tonic-gate (void) lgrp_topo_ht_limit_set(1); 6847c478bd9Sstevel@tonic-gate } else { 6857c478bd9Sstevel@tonic-gate max_mem_nodes = lgrp_plat_node_cnt; 6867c478bd9Sstevel@tonic-gate 6877c478bd9Sstevel@tonic-gate /* 6887c478bd9Sstevel@tonic-gate * Probing errors can mess up the lgroup topology and force us 6897c478bd9Sstevel@tonic-gate * fall back to a 2 level lgroup topology. Here we bound how 6907c478bd9Sstevel@tonic-gate * tall the lgroup topology can grow in hopes of avoiding any 6917c478bd9Sstevel@tonic-gate * anamolies in probing from messing up the lgroup topology 6927c478bd9Sstevel@tonic-gate * by limiting the accuracy of the latency topology. 6937c478bd9Sstevel@tonic-gate * 6947c478bd9Sstevel@tonic-gate * Assume that nodes will at least be configured in a ring, 6957c478bd9Sstevel@tonic-gate * so limit height of lgroup topology to be less than number 6967c478bd9Sstevel@tonic-gate * of nodes on a system with 4 or more nodes 6977c478bd9Sstevel@tonic-gate */ 6987c478bd9Sstevel@tonic-gate if (lgrp_plat_node_cnt >= 4 && 6997c478bd9Sstevel@tonic-gate lgrp_topo_ht_limit() == lgrp_topo_ht_limit_default()) 7007c478bd9Sstevel@tonic-gate (void) lgrp_topo_ht_limit_set(lgrp_plat_node_cnt - 1); 7017c478bd9Sstevel@tonic-gate } 7027c478bd9Sstevel@tonic-gate 7037c478bd9Sstevel@tonic-gate /* 7047c478bd9Sstevel@tonic-gate * Lgroups on Opteron architectures have but a single physical 7057c478bd9Sstevel@tonic-gate * processor. Tune lgrp_expand_proc_thresh and lgrp_expand_proc_diff 7067c478bd9Sstevel@tonic-gate * so that lgrp_choose() will spread things out aggressively. 7077c478bd9Sstevel@tonic-gate */ 7087c478bd9Sstevel@tonic-gate lgrp_expand_proc_thresh = LGRP_LOADAVG_THREAD_MAX / 2; 7097c478bd9Sstevel@tonic-gate lgrp_expand_proc_diff = 0; 7107c478bd9Sstevel@tonic-gate } 7117c478bd9Sstevel@tonic-gate 7127c478bd9Sstevel@tonic-gate 7137c478bd9Sstevel@tonic-gate /* 7147c478bd9Sstevel@tonic-gate * Latencies must be within 1/(2**LGRP_LAT_TOLERANCE_SHIFT) of each other to 7157c478bd9Sstevel@tonic-gate * be considered same 7167c478bd9Sstevel@tonic-gate */ 7177c478bd9Sstevel@tonic-gate #define LGRP_LAT_TOLERANCE_SHIFT 4 7187c478bd9Sstevel@tonic-gate 7197c478bd9Sstevel@tonic-gate int lgrp_plat_probe_lt_shift = LGRP_LAT_TOLERANCE_SHIFT; 7207c478bd9Sstevel@tonic-gate 7217c478bd9Sstevel@tonic-gate 7227c478bd9Sstevel@tonic-gate /* 7237c478bd9Sstevel@tonic-gate * Adjust latencies between nodes to be symmetric, normalize latencies between 7247c478bd9Sstevel@tonic-gate * any nodes that are within some tolerance to be same, and make local 7257c478bd9Sstevel@tonic-gate * latencies be same 7267c478bd9Sstevel@tonic-gate */ 7277c478bd9Sstevel@tonic-gate static void 7287c478bd9Sstevel@tonic-gate lgrp_plat_latency_adjust(void) 7297c478bd9Sstevel@tonic-gate { 7307c478bd9Sstevel@tonic-gate int i; 7317c478bd9Sstevel@tonic-gate int j; 7327c478bd9Sstevel@tonic-gate int k; 7337c478bd9Sstevel@tonic-gate int l; 7347c478bd9Sstevel@tonic-gate u_longlong_t max; 7357c478bd9Sstevel@tonic-gate u_longlong_t min; 7367c478bd9Sstevel@tonic-gate u_longlong_t t; 7377c478bd9Sstevel@tonic-gate u_longlong_t t1; 7387c478bd9Sstevel@tonic-gate u_longlong_t t2; 73903400a71Sjjc const lgrp_config_flag_t cflag = LGRP_CONFIG_LAT_CHANGE_ALL; 7407c478bd9Sstevel@tonic-gate int lat_corrected[MAX_NODES][MAX_NODES]; 7417c478bd9Sstevel@tonic-gate 7427c478bd9Sstevel@tonic-gate /* 7437c478bd9Sstevel@tonic-gate * Nothing to do when this is an UMA machine 7447c478bd9Sstevel@tonic-gate */ 7457c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 7467c478bd9Sstevel@tonic-gate return; 7477c478bd9Sstevel@tonic-gate 7487c478bd9Sstevel@tonic-gate /* 7497c478bd9Sstevel@tonic-gate * Make sure that latencies are symmetric between any two nodes 7507c478bd9Sstevel@tonic-gate * (ie. latency(node0, node1) == latency(node1, node0)) 7517c478bd9Sstevel@tonic-gate */ 7527c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 7537c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 7547c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[i][j]; 7557c478bd9Sstevel@tonic-gate t2 = lgrp_plat_probe_times[j][i]; 7567c478bd9Sstevel@tonic-gate 7577c478bd9Sstevel@tonic-gate if (t1 == 0 || t2 == 0 || t1 == t2) 7587c478bd9Sstevel@tonic-gate continue; 7597c478bd9Sstevel@tonic-gate 7607c478bd9Sstevel@tonic-gate /* 7617c478bd9Sstevel@tonic-gate * Latencies should be same 7627c478bd9Sstevel@tonic-gate * - Use minimum of two latencies which should be same 7637c478bd9Sstevel@tonic-gate * - Track suspect probe times not within tolerance of 7647c478bd9Sstevel@tonic-gate * min value 7657c478bd9Sstevel@tonic-gate * - Remember how much values are corrected by 7667c478bd9Sstevel@tonic-gate */ 7677c478bd9Sstevel@tonic-gate if (t1 > t2) { 7687c478bd9Sstevel@tonic-gate t = t2; 7697c478bd9Sstevel@tonic-gate lgrp_plat_probe_errors[i][j] += t1 - t2; 7707c478bd9Sstevel@tonic-gate if (t1 - t2 > t2 >> lgrp_plat_probe_lt_shift) { 7717c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[i][j]++; 7727c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[j][i]++; 7737c478bd9Sstevel@tonic-gate } 7747c478bd9Sstevel@tonic-gate } else if (t2 > t1) { 7757c478bd9Sstevel@tonic-gate t = t1; 7767c478bd9Sstevel@tonic-gate lgrp_plat_probe_errors[j][i] += t2 - t1; 7777c478bd9Sstevel@tonic-gate if (t2 - t1 > t1 >> lgrp_plat_probe_lt_shift) { 7787c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[i][j]++; 7797c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[j][i]++; 7807c478bd9Sstevel@tonic-gate } 7817c478bd9Sstevel@tonic-gate } 7827c478bd9Sstevel@tonic-gate 7837c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][j] = 7847c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[j][i] = t; 7857c478bd9Sstevel@tonic-gate lgrp_config(cflag, t1, t); 7867c478bd9Sstevel@tonic-gate lgrp_config(cflag, t2, t); 7877c478bd9Sstevel@tonic-gate } 7887c478bd9Sstevel@tonic-gate 7897c478bd9Sstevel@tonic-gate /* 7907c478bd9Sstevel@tonic-gate * Keep track of which latencies get corrected 7917c478bd9Sstevel@tonic-gate */ 7927c478bd9Sstevel@tonic-gate for (i = 0; i < MAX_NODES; i++) 7937c478bd9Sstevel@tonic-gate for (j = 0; j < MAX_NODES; j++) 7947c478bd9Sstevel@tonic-gate lat_corrected[i][j] = 0; 7957c478bd9Sstevel@tonic-gate 7967c478bd9Sstevel@tonic-gate /* 7977c478bd9Sstevel@tonic-gate * For every two nodes, see whether there is another pair of nodes which 7987c478bd9Sstevel@tonic-gate * are about the same distance apart and make the latencies be the same 7997c478bd9Sstevel@tonic-gate * if they are close enough together 8007c478bd9Sstevel@tonic-gate */ 8017c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 8027c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 8037c478bd9Sstevel@tonic-gate /* 8047c478bd9Sstevel@tonic-gate * Pick one pair of nodes (i, j) 8057c478bd9Sstevel@tonic-gate * and get latency between them 8067c478bd9Sstevel@tonic-gate */ 8077c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[i][j]; 8087c478bd9Sstevel@tonic-gate 8097c478bd9Sstevel@tonic-gate /* 8107c478bd9Sstevel@tonic-gate * Skip this pair of nodes if there isn't a latency 8117c478bd9Sstevel@tonic-gate * for it yet 8127c478bd9Sstevel@tonic-gate */ 8137c478bd9Sstevel@tonic-gate if (t1 == 0) 8147c478bd9Sstevel@tonic-gate continue; 8157c478bd9Sstevel@tonic-gate 8167c478bd9Sstevel@tonic-gate for (k = 0; k < lgrp_plat_node_cnt; k++) 8177c478bd9Sstevel@tonic-gate for (l = 0; l < lgrp_plat_node_cnt; l++) { 8187c478bd9Sstevel@tonic-gate /* 8197c478bd9Sstevel@tonic-gate * Pick another pair of nodes (k, l) 8207c478bd9Sstevel@tonic-gate * not same as (i, j) and get latency 8217c478bd9Sstevel@tonic-gate * between them 8227c478bd9Sstevel@tonic-gate */ 8237c478bd9Sstevel@tonic-gate if (k == i && l == j) 8247c478bd9Sstevel@tonic-gate continue; 8257c478bd9Sstevel@tonic-gate 8267c478bd9Sstevel@tonic-gate t2 = lgrp_plat_probe_times[k][l]; 8277c478bd9Sstevel@tonic-gate 8287c478bd9Sstevel@tonic-gate /* 8297c478bd9Sstevel@tonic-gate * Skip this pair of nodes if there 8307c478bd9Sstevel@tonic-gate * isn't a latency for it yet 8317c478bd9Sstevel@tonic-gate */ 8327c478bd9Sstevel@tonic-gate 8337c478bd9Sstevel@tonic-gate if (t2 == 0) 8347c478bd9Sstevel@tonic-gate continue; 8357c478bd9Sstevel@tonic-gate 8367c478bd9Sstevel@tonic-gate /* 8377c478bd9Sstevel@tonic-gate * Skip nodes (k, l) if they already 8387c478bd9Sstevel@tonic-gate * have same latency as (i, j) or 8397c478bd9Sstevel@tonic-gate * their latency isn't close enough to 8407c478bd9Sstevel@tonic-gate * be considered/made the same 8417c478bd9Sstevel@tonic-gate */ 8427c478bd9Sstevel@tonic-gate if (t1 == t2 || (t1 > t2 && t1 - t2 > 8437c478bd9Sstevel@tonic-gate t1 >> lgrp_plat_probe_lt_shift) || 8447c478bd9Sstevel@tonic-gate (t2 > t1 && t2 - t1 > 8457c478bd9Sstevel@tonic-gate t2 >> lgrp_plat_probe_lt_shift)) 8467c478bd9Sstevel@tonic-gate continue; 8477c478bd9Sstevel@tonic-gate 8487c478bd9Sstevel@tonic-gate /* 8497c478bd9Sstevel@tonic-gate * Make latency(i, j) same as 8507c478bd9Sstevel@tonic-gate * latency(k, l), try to use latency 8517c478bd9Sstevel@tonic-gate * that has been adjusted already to get 8527c478bd9Sstevel@tonic-gate * more consistency (if possible), and 8537c478bd9Sstevel@tonic-gate * remember which latencies were 8547c478bd9Sstevel@tonic-gate * adjusted for next time 8557c478bd9Sstevel@tonic-gate */ 8567c478bd9Sstevel@tonic-gate if (lat_corrected[i][j]) { 8577c478bd9Sstevel@tonic-gate t = t1; 8587c478bd9Sstevel@tonic-gate lgrp_config(cflag, t2, t); 8597c478bd9Sstevel@tonic-gate t2 = t; 8607c478bd9Sstevel@tonic-gate } else if (lat_corrected[k][l]) { 8617c478bd9Sstevel@tonic-gate t = t2; 8627c478bd9Sstevel@tonic-gate lgrp_config(cflag, t1, t); 8637c478bd9Sstevel@tonic-gate t1 = t; 8647c478bd9Sstevel@tonic-gate } else { 8657c478bd9Sstevel@tonic-gate if (t1 > t2) 8667c478bd9Sstevel@tonic-gate t = t2; 8677c478bd9Sstevel@tonic-gate else 8687c478bd9Sstevel@tonic-gate t = t1; 8697c478bd9Sstevel@tonic-gate lgrp_config(cflag, t1, t); 8707c478bd9Sstevel@tonic-gate lgrp_config(cflag, t2, t); 8717c478bd9Sstevel@tonic-gate t1 = t2 = t; 8727c478bd9Sstevel@tonic-gate } 8737c478bd9Sstevel@tonic-gate 8747c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][j] = 8757c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[k][l] = t; 8767c478bd9Sstevel@tonic-gate 8777c478bd9Sstevel@tonic-gate lat_corrected[i][j] = 8787c478bd9Sstevel@tonic-gate lat_corrected[k][l] = 1; 8797c478bd9Sstevel@tonic-gate } 8807c478bd9Sstevel@tonic-gate } 8817c478bd9Sstevel@tonic-gate 8827c478bd9Sstevel@tonic-gate /* 8837c478bd9Sstevel@tonic-gate * Local latencies should be same 8847c478bd9Sstevel@tonic-gate * - Find min and max local latencies 8857c478bd9Sstevel@tonic-gate * - Make all local latencies be minimum 8867c478bd9Sstevel@tonic-gate */ 8877c478bd9Sstevel@tonic-gate min = -1; 8887c478bd9Sstevel@tonic-gate max = 0; 8897c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 8907c478bd9Sstevel@tonic-gate t = lgrp_plat_probe_times[i][i]; 8917c478bd9Sstevel@tonic-gate if (t == 0) 8927c478bd9Sstevel@tonic-gate continue; 8937c478bd9Sstevel@tonic-gate if (min == -1 || t < min) 8947c478bd9Sstevel@tonic-gate min = t; 8957c478bd9Sstevel@tonic-gate if (t > max) 8967c478bd9Sstevel@tonic-gate max = t; 8977c478bd9Sstevel@tonic-gate } 8987c478bd9Sstevel@tonic-gate if (min != max) { 8997c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 9007c478bd9Sstevel@tonic-gate int local; 9017c478bd9Sstevel@tonic-gate 9027c478bd9Sstevel@tonic-gate local = lgrp_plat_probe_times[i][i]; 9037c478bd9Sstevel@tonic-gate if (local == 0) 9047c478bd9Sstevel@tonic-gate continue; 9057c478bd9Sstevel@tonic-gate 9067c478bd9Sstevel@tonic-gate /* 9077c478bd9Sstevel@tonic-gate * Track suspect probe times that aren't within 9087c478bd9Sstevel@tonic-gate * tolerance of minimum local latency and how much 9097c478bd9Sstevel@tonic-gate * probe times are corrected by 9107c478bd9Sstevel@tonic-gate */ 9117c478bd9Sstevel@tonic-gate if (local - min > min >> lgrp_plat_probe_lt_shift) 9127c478bd9Sstevel@tonic-gate lgrp_plat_probe_suspect[i][i]++; 9137c478bd9Sstevel@tonic-gate 9147c478bd9Sstevel@tonic-gate lgrp_plat_probe_errors[i][i] += local - min; 9157c478bd9Sstevel@tonic-gate 9167c478bd9Sstevel@tonic-gate /* 9177c478bd9Sstevel@tonic-gate * Make local latencies be minimum 9187c478bd9Sstevel@tonic-gate */ 91903400a71Sjjc lgrp_config(LGRP_CONFIG_LAT_CHANGE, i, min); 9207c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][i] = min; 9217c478bd9Sstevel@tonic-gate } 9227c478bd9Sstevel@tonic-gate } 9237c478bd9Sstevel@tonic-gate 9247c478bd9Sstevel@tonic-gate /* 9257c478bd9Sstevel@tonic-gate * Determine max probe time again since just adjusted latencies 9267c478bd9Sstevel@tonic-gate */ 9277c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max = 0; 9287c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 9297c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 9307c478bd9Sstevel@tonic-gate t = lgrp_plat_probe_times[i][j]; 9317c478bd9Sstevel@tonic-gate if (t > lgrp_plat_probe_time_max) 9327c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max = t; 9337c478bd9Sstevel@tonic-gate } 9347c478bd9Sstevel@tonic-gate } 9357c478bd9Sstevel@tonic-gate 9367c478bd9Sstevel@tonic-gate 9377c478bd9Sstevel@tonic-gate /* 9387c478bd9Sstevel@tonic-gate * Verify following about latencies between nodes: 9397c478bd9Sstevel@tonic-gate * 9407c478bd9Sstevel@tonic-gate * - Latencies should be symmetric (ie. latency(a, b) == latency(b, a)) 9417c478bd9Sstevel@tonic-gate * - Local latencies same 9427c478bd9Sstevel@tonic-gate * - Local < remote 9437c478bd9Sstevel@tonic-gate * - Number of latencies seen is reasonable 9447c478bd9Sstevel@tonic-gate * - Number of occurrences of a given latency should be more than 1 9457c478bd9Sstevel@tonic-gate * 9467c478bd9Sstevel@tonic-gate * Returns: 9477c478bd9Sstevel@tonic-gate * 0 Success 9487c478bd9Sstevel@tonic-gate * -1 Not symmetric 9497c478bd9Sstevel@tonic-gate * -2 Local latencies not same 9507c478bd9Sstevel@tonic-gate * -3 Local >= remote 9517c478bd9Sstevel@tonic-gate * -4 Wrong number of latencies 9527c478bd9Sstevel@tonic-gate * -5 Not enough occurrences of given latency 9537c478bd9Sstevel@tonic-gate */ 9547c478bd9Sstevel@tonic-gate static int 9557c478bd9Sstevel@tonic-gate lgrp_plat_latency_verify(void) 9567c478bd9Sstevel@tonic-gate { 9577c478bd9Sstevel@tonic-gate int i; 9587c478bd9Sstevel@tonic-gate int j; 9597c478bd9Sstevel@tonic-gate lgrp_plat_latency_acct_t *l; 9607c478bd9Sstevel@tonic-gate int probed; 9617c478bd9Sstevel@tonic-gate u_longlong_t t1; 9627c478bd9Sstevel@tonic-gate u_longlong_t t2; 9637c478bd9Sstevel@tonic-gate 9647c478bd9Sstevel@tonic-gate /* 9652dae3fb5Sjjc * Nothing to do when this is an UMA machine, lgroup topology is 9662dae3fb5Sjjc * limited to 2 levels, or there aren't any probe times yet 9677c478bd9Sstevel@tonic-gate */ 9687c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1 || lgrp_topo_levels < 2 || 9692dae3fb5Sjjc (lgrp_plat_probe_time_max == 0 && lgrp_plat_probe_time_min == -1)) 9707c478bd9Sstevel@tonic-gate return (0); 9717c478bd9Sstevel@tonic-gate 9727c478bd9Sstevel@tonic-gate /* 9737c478bd9Sstevel@tonic-gate * Make sure that latencies are symmetric between any two nodes 9747c478bd9Sstevel@tonic-gate * (ie. latency(node0, node1) == latency(node1, node0)) 9757c478bd9Sstevel@tonic-gate */ 9767c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 9777c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 9787c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[i][j]; 9797c478bd9Sstevel@tonic-gate t2 = lgrp_plat_probe_times[j][i]; 9807c478bd9Sstevel@tonic-gate 9817c478bd9Sstevel@tonic-gate if (t1 == 0 || t2 == 0 || t1 == t2) 9827c478bd9Sstevel@tonic-gate continue; 9837c478bd9Sstevel@tonic-gate 9847c478bd9Sstevel@tonic-gate return (-1); 9857c478bd9Sstevel@tonic-gate } 9867c478bd9Sstevel@tonic-gate 9877c478bd9Sstevel@tonic-gate /* 9887c478bd9Sstevel@tonic-gate * Local latencies should be same 9897c478bd9Sstevel@tonic-gate */ 9907c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[0][0]; 9917c478bd9Sstevel@tonic-gate for (i = 1; i < lgrp_plat_node_cnt; i++) { 9927c478bd9Sstevel@tonic-gate t2 = lgrp_plat_probe_times[i][i]; 9937c478bd9Sstevel@tonic-gate if (t2 == 0) 9947c478bd9Sstevel@tonic-gate continue; 9957c478bd9Sstevel@tonic-gate 9962dae3fb5Sjjc if (t1 == 0) { 9972dae3fb5Sjjc t1 = t2; 9982dae3fb5Sjjc continue; 9992dae3fb5Sjjc } 10002dae3fb5Sjjc 10017c478bd9Sstevel@tonic-gate if (t1 != t2) 10027c478bd9Sstevel@tonic-gate return (-2); 10037c478bd9Sstevel@tonic-gate } 10047c478bd9Sstevel@tonic-gate 10057c478bd9Sstevel@tonic-gate /* 10067c478bd9Sstevel@tonic-gate * Local latencies should be less than remote 10077c478bd9Sstevel@tonic-gate */ 10082dae3fb5Sjjc if (t1) { 10097c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 10107c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 10112dae3fb5Sjjc t2 = lgrp_plat_probe_times[i][j]; 10127c478bd9Sstevel@tonic-gate if (i == j || t2 == 0) 10137c478bd9Sstevel@tonic-gate continue; 10147c478bd9Sstevel@tonic-gate 10157c478bd9Sstevel@tonic-gate if (t1 >= t2) 10167c478bd9Sstevel@tonic-gate return (-3); 10177c478bd9Sstevel@tonic-gate } 10182dae3fb5Sjjc } 10197c478bd9Sstevel@tonic-gate 10207c478bd9Sstevel@tonic-gate /* 10217c478bd9Sstevel@tonic-gate * Rest of checks are not very useful for machines with less than 10227c478bd9Sstevel@tonic-gate * 4 nodes (which means less than 3 latencies on Opteron) 10237c478bd9Sstevel@tonic-gate */ 10247c478bd9Sstevel@tonic-gate if (lgrp_plat_node_cnt < 4) 10257c478bd9Sstevel@tonic-gate return (0); 10267c478bd9Sstevel@tonic-gate 10277c478bd9Sstevel@tonic-gate /* 10287c478bd9Sstevel@tonic-gate * Need to see whether done probing in order to verify number of 10297c478bd9Sstevel@tonic-gate * latencies are correct 10307c478bd9Sstevel@tonic-gate */ 10317c478bd9Sstevel@tonic-gate probed = 0; 10327c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) 10337c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[i][i]) 10347c478bd9Sstevel@tonic-gate probed++; 10357c478bd9Sstevel@tonic-gate 10367c478bd9Sstevel@tonic-gate if (probed != lgrp_plat_node_cnt) 10377c478bd9Sstevel@tonic-gate return (0); 10387c478bd9Sstevel@tonic-gate 10397c478bd9Sstevel@tonic-gate /* 10407c478bd9Sstevel@tonic-gate * Determine number of unique latencies seen in probe times, 10417c478bd9Sstevel@tonic-gate * their values, and number of occurrences of each 10427c478bd9Sstevel@tonic-gate */ 10437c478bd9Sstevel@tonic-gate lgrp_plat_probe_nlatencies = 0; 10447c478bd9Sstevel@tonic-gate bzero(lgrp_plat_probe_lat_acct, 10457c478bd9Sstevel@tonic-gate MAX_NODES * sizeof (lgrp_plat_latency_acct_t)); 10467c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 10477c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 10487c478bd9Sstevel@tonic-gate int k; 10497c478bd9Sstevel@tonic-gate 10507c478bd9Sstevel@tonic-gate /* 10517c478bd9Sstevel@tonic-gate * Look at each probe time 10527c478bd9Sstevel@tonic-gate */ 10537c478bd9Sstevel@tonic-gate t1 = lgrp_plat_probe_times[i][j]; 10547c478bd9Sstevel@tonic-gate if (t1 == 0) 10557c478bd9Sstevel@tonic-gate continue; 10567c478bd9Sstevel@tonic-gate 10577c478bd9Sstevel@tonic-gate /* 10587c478bd9Sstevel@tonic-gate * Account for unique latencies 10597c478bd9Sstevel@tonic-gate */ 10607c478bd9Sstevel@tonic-gate for (k = 0; k < lgrp_plat_node_cnt; k++) { 10617c478bd9Sstevel@tonic-gate l = &lgrp_plat_probe_lat_acct[k]; 10627c478bd9Sstevel@tonic-gate if (t1 == l->la_value) { 10637c478bd9Sstevel@tonic-gate /* 10647c478bd9Sstevel@tonic-gate * Increment number of occurrences 10657c478bd9Sstevel@tonic-gate * if seen before 10667c478bd9Sstevel@tonic-gate */ 10677c478bd9Sstevel@tonic-gate l->la_count++; 10687c478bd9Sstevel@tonic-gate break; 10697c478bd9Sstevel@tonic-gate } else if (l->la_value == 0) { 10707c478bd9Sstevel@tonic-gate /* 10717c478bd9Sstevel@tonic-gate * Record latency if haven't seen before 10727c478bd9Sstevel@tonic-gate */ 10737c478bd9Sstevel@tonic-gate l->la_value = t1; 10747c478bd9Sstevel@tonic-gate l->la_count++; 10757c478bd9Sstevel@tonic-gate lgrp_plat_probe_nlatencies++; 10767c478bd9Sstevel@tonic-gate break; 10777c478bd9Sstevel@tonic-gate } 10787c478bd9Sstevel@tonic-gate } 10797c478bd9Sstevel@tonic-gate } 10807c478bd9Sstevel@tonic-gate } 10817c478bd9Sstevel@tonic-gate 10827c478bd9Sstevel@tonic-gate /* 10837c478bd9Sstevel@tonic-gate * Number of latencies should be relative to number of 10847c478bd9Sstevel@tonic-gate * nodes in system: 10857c478bd9Sstevel@tonic-gate * - Same as nodes when nodes <= 2 10867c478bd9Sstevel@tonic-gate * - Less than nodes when nodes > 2 10877c478bd9Sstevel@tonic-gate * - Greater than 2 when nodes >= 4 10887c478bd9Sstevel@tonic-gate */ 10897c478bd9Sstevel@tonic-gate if ((lgrp_plat_node_cnt <= 2 && 10907c478bd9Sstevel@tonic-gate lgrp_plat_probe_nlatencies != lgrp_plat_node_cnt) || 10917c478bd9Sstevel@tonic-gate (lgrp_plat_node_cnt > 2 && 10927c478bd9Sstevel@tonic-gate lgrp_plat_probe_nlatencies >= lgrp_plat_node_cnt) || 10937c478bd9Sstevel@tonic-gate (lgrp_plat_node_cnt >= 4 && lgrp_topo_levels >= 3 && 10947c478bd9Sstevel@tonic-gate lgrp_plat_probe_nlatencies <= 2)) 10957c478bd9Sstevel@tonic-gate return (-4); 10967c478bd9Sstevel@tonic-gate 10977c478bd9Sstevel@tonic-gate /* 10987c478bd9Sstevel@tonic-gate * There should be more than one occurrence of every latency 10997c478bd9Sstevel@tonic-gate * as long as probing is complete 11007c478bd9Sstevel@tonic-gate */ 11017c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_probe_nlatencies; i++) { 11027c478bd9Sstevel@tonic-gate l = &lgrp_plat_probe_lat_acct[i]; 11037c478bd9Sstevel@tonic-gate if (l->la_count <= 1) 11047c478bd9Sstevel@tonic-gate return (-5); 11057c478bd9Sstevel@tonic-gate } 11067c478bd9Sstevel@tonic-gate return (0); 11077c478bd9Sstevel@tonic-gate } 11087c478bd9Sstevel@tonic-gate 11097c478bd9Sstevel@tonic-gate 11107c478bd9Sstevel@tonic-gate /* 11117c478bd9Sstevel@tonic-gate * Set lgroup latencies for 2 level lgroup topology 11127c478bd9Sstevel@tonic-gate */ 11137c478bd9Sstevel@tonic-gate static void 11147c478bd9Sstevel@tonic-gate lgrp_plat_2level_setup(void) 11157c478bd9Sstevel@tonic-gate { 11167c478bd9Sstevel@tonic-gate int i; 11177c478bd9Sstevel@tonic-gate 11187c478bd9Sstevel@tonic-gate if (lgrp_plat_node_cnt >= 4) 11197c478bd9Sstevel@tonic-gate cmn_err(CE_NOTE, 11207c478bd9Sstevel@tonic-gate "MPO only optimizing for local and remote\n"); 11217c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 11227c478bd9Sstevel@tonic-gate int j; 11237c478bd9Sstevel@tonic-gate 11247c478bd9Sstevel@tonic-gate for (j = 0; j < lgrp_plat_node_cnt; j++) { 11257c478bd9Sstevel@tonic-gate if (i == j) 11267c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][j] = 2; 11277c478bd9Sstevel@tonic-gate else 11287c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[i][j] = 3; 11297c478bd9Sstevel@tonic-gate } 11307c478bd9Sstevel@tonic-gate } 11317c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_min = 2; 11327c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max = 3; 11337c478bd9Sstevel@tonic-gate lgrp_config(LGRP_CONFIG_FLATTEN, 2, 0); 11347c478bd9Sstevel@tonic-gate } 11357c478bd9Sstevel@tonic-gate 11367c478bd9Sstevel@tonic-gate 11377c478bd9Sstevel@tonic-gate /* 11387c478bd9Sstevel@tonic-gate * Return time needed to probe from current CPU to memory in given node 11397c478bd9Sstevel@tonic-gate */ 11407c478bd9Sstevel@tonic-gate static hrtime_t 11417c478bd9Sstevel@tonic-gate lgrp_plat_probe_time(int to) 11427c478bd9Sstevel@tonic-gate { 11437c478bd9Sstevel@tonic-gate caddr_t buf; 11447c478bd9Sstevel@tonic-gate uint_t dev; 11457c478bd9Sstevel@tonic-gate /* LINTED: set but not used in function */ 11467c478bd9Sstevel@tonic-gate volatile uint_t dev_vendor; 11477c478bd9Sstevel@tonic-gate hrtime_t elapsed; 11487c478bd9Sstevel@tonic-gate hrtime_t end; 11497c478bd9Sstevel@tonic-gate int from; 11507c478bd9Sstevel@tonic-gate int i; 11517c478bd9Sstevel@tonic-gate int ipl; 11527c478bd9Sstevel@tonic-gate hrtime_t max; 11537c478bd9Sstevel@tonic-gate hrtime_t min; 11547c478bd9Sstevel@tonic-gate hrtime_t start; 11558949bcd6Sandrei int cnt; 11567c478bd9Sstevel@tonic-gate extern int use_sse_pagecopy; 11577c478bd9Sstevel@tonic-gate 11587c478bd9Sstevel@tonic-gate /* 11597c478bd9Sstevel@tonic-gate * Determine ID of node containing current CPU 11607c478bd9Sstevel@tonic-gate */ 11617c478bd9Sstevel@tonic-gate from = LGRP_PLAT_CPU_TO_NODE(CPU); 11627c478bd9Sstevel@tonic-gate 11637c478bd9Sstevel@tonic-gate /* 11647c478bd9Sstevel@tonic-gate * Do common work for probing main memory 11657c478bd9Sstevel@tonic-gate */ 11667c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_PGCPY) { 11677c478bd9Sstevel@tonic-gate /* 11687c478bd9Sstevel@tonic-gate * Skip probing any nodes without memory and 11697c478bd9Sstevel@tonic-gate * set probe time to 0 11707c478bd9Sstevel@tonic-gate */ 11717c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_memory[to] == NULL) { 11727c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[from][to] = 0; 11737c478bd9Sstevel@tonic-gate return (0); 11747c478bd9Sstevel@tonic-gate } 11757c478bd9Sstevel@tonic-gate 11767c478bd9Sstevel@tonic-gate /* 11777c478bd9Sstevel@tonic-gate * Invalidate caches once instead of once every sample 11787c478bd9Sstevel@tonic-gate * which should cut cost of probing by a lot 11797c478bd9Sstevel@tonic-gate */ 11807c478bd9Sstevel@tonic-gate lgrp_plat_flush_cost = gethrtime(); 11817c478bd9Sstevel@tonic-gate invalidate_cache(); 11827c478bd9Sstevel@tonic-gate lgrp_plat_flush_cost = gethrtime() - lgrp_plat_flush_cost; 11837c478bd9Sstevel@tonic-gate lgrp_plat_probe_cost_total += lgrp_plat_flush_cost; 11847c478bd9Sstevel@tonic-gate } 11857c478bd9Sstevel@tonic-gate 11867c478bd9Sstevel@tonic-gate /* 11877c478bd9Sstevel@tonic-gate * Probe from current CPU to given memory using specified operation 11887c478bd9Sstevel@tonic-gate * and take specified number of samples 11897c478bd9Sstevel@tonic-gate */ 11907c478bd9Sstevel@tonic-gate max = 0; 11917c478bd9Sstevel@tonic-gate min = -1; 11927c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_probe_nsamples; i++) { 11937c478bd9Sstevel@tonic-gate lgrp_plat_probe_cost = gethrtime(); 11947c478bd9Sstevel@tonic-gate 11957c478bd9Sstevel@tonic-gate /* 11967c478bd9Sstevel@tonic-gate * Can't measure probe time if gethrtime() isn't working yet 11977c478bd9Sstevel@tonic-gate */ 11987c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_cost == 0 && gethrtime() == 0) 11997c478bd9Sstevel@tonic-gate return (0); 12007c478bd9Sstevel@tonic-gate 12017c478bd9Sstevel@tonic-gate switch (lgrp_plat_probe_op) { 12027c478bd9Sstevel@tonic-gate 12037c478bd9Sstevel@tonic-gate case LGRP_PLAT_PROBE_PGCPY: 12047c478bd9Sstevel@tonic-gate default: 12057c478bd9Sstevel@tonic-gate /* 12067c478bd9Sstevel@tonic-gate * Measure how long it takes to copy page 12077c478bd9Sstevel@tonic-gate * on top of itself 12087c478bd9Sstevel@tonic-gate */ 12097c478bd9Sstevel@tonic-gate buf = lgrp_plat_probe_memory[to] + (i * PAGESIZE); 12107c478bd9Sstevel@tonic-gate 12117c478bd9Sstevel@tonic-gate kpreempt_disable(); 12127c478bd9Sstevel@tonic-gate ipl = splhigh(); 12137c478bd9Sstevel@tonic-gate start = gethrtime(); 12147c478bd9Sstevel@tonic-gate if (use_sse_pagecopy) 12157c478bd9Sstevel@tonic-gate hwblkpagecopy(buf, buf); 12167c478bd9Sstevel@tonic-gate else 12177c478bd9Sstevel@tonic-gate bcopy(buf, buf, PAGESIZE); 12187c478bd9Sstevel@tonic-gate end = gethrtime(); 12197c478bd9Sstevel@tonic-gate elapsed = end - start; 12207c478bd9Sstevel@tonic-gate splx(ipl); 12217c478bd9Sstevel@tonic-gate kpreempt_enable(); 12227c478bd9Sstevel@tonic-gate break; 12237c478bd9Sstevel@tonic-gate 12247c478bd9Sstevel@tonic-gate case LGRP_PLAT_PROBE_VENDOR: 12257c478bd9Sstevel@tonic-gate /* 12267c478bd9Sstevel@tonic-gate * Measure how long it takes to read vendor ID from 12277c478bd9Sstevel@tonic-gate * Northbridge 12287c478bd9Sstevel@tonic-gate */ 12297c478bd9Sstevel@tonic-gate dev = OPT_PCS_DEV_NODE0 + to; 12307c478bd9Sstevel@tonic-gate kpreempt_disable(); 12317c478bd9Sstevel@tonic-gate ipl = spl8(); 12327c478bd9Sstevel@tonic-gate outl(PCI_CONFADD, PCI_CADDR1(0, dev, opt_probe_func, 12337c478bd9Sstevel@tonic-gate OPT_PCS_OFF_VENDOR)); 12347c478bd9Sstevel@tonic-gate start = gethrtime(); 12358949bcd6Sandrei for (cnt = 0; cnt < lgrp_plat_probe_nreads; cnt++) 12367c478bd9Sstevel@tonic-gate dev_vendor = inl(PCI_CONFDATA); 12377c478bd9Sstevel@tonic-gate end = gethrtime(); 12388949bcd6Sandrei elapsed = (end - start) / lgrp_plat_probe_nreads; 12397c478bd9Sstevel@tonic-gate splx(ipl); 12407c478bd9Sstevel@tonic-gate kpreempt_enable(); 12417c478bd9Sstevel@tonic-gate break; 12427c478bd9Sstevel@tonic-gate } 12437c478bd9Sstevel@tonic-gate 12447c478bd9Sstevel@tonic-gate lgrp_plat_probe_cost = gethrtime() - lgrp_plat_probe_cost; 12457c478bd9Sstevel@tonic-gate lgrp_plat_probe_cost_total += lgrp_plat_probe_cost; 12467c478bd9Sstevel@tonic-gate 12477c478bd9Sstevel@tonic-gate if (min == -1 || elapsed < min) 12487c478bd9Sstevel@tonic-gate min = elapsed; 12497c478bd9Sstevel@tonic-gate if (elapsed > max) 12507c478bd9Sstevel@tonic-gate max = elapsed; 12517c478bd9Sstevel@tonic-gate } 12527c478bd9Sstevel@tonic-gate 12537c478bd9Sstevel@tonic-gate /* 12547c478bd9Sstevel@tonic-gate * Update minimum and maximum probe times between 12557c478bd9Sstevel@tonic-gate * these two nodes 12567c478bd9Sstevel@tonic-gate */ 12577c478bd9Sstevel@tonic-gate if (min < lgrp_plat_probe_min[from][to] || 12587c478bd9Sstevel@tonic-gate lgrp_plat_probe_min[from][to] == 0) 12597c478bd9Sstevel@tonic-gate lgrp_plat_probe_min[from][to] = min; 12607c478bd9Sstevel@tonic-gate 12617c478bd9Sstevel@tonic-gate if (max > lgrp_plat_probe_max[from][to]) 12627c478bd9Sstevel@tonic-gate lgrp_plat_probe_max[from][to] = max; 12637c478bd9Sstevel@tonic-gate 12647c478bd9Sstevel@tonic-gate return (min); 12657c478bd9Sstevel@tonic-gate } 12667c478bd9Sstevel@tonic-gate 12677c478bd9Sstevel@tonic-gate 12687c478bd9Sstevel@tonic-gate /* 12697c478bd9Sstevel@tonic-gate * Probe memory in each node from current CPU to determine latency topology 12707c478bd9Sstevel@tonic-gate */ 12717c478bd9Sstevel@tonic-gate void 12727c478bd9Sstevel@tonic-gate lgrp_plat_probe(void) 12737c478bd9Sstevel@tonic-gate { 12747c478bd9Sstevel@tonic-gate int from; 12757c478bd9Sstevel@tonic-gate int i; 12767c478bd9Sstevel@tonic-gate hrtime_t probe_time; 12777c478bd9Sstevel@tonic-gate int to; 12787c478bd9Sstevel@tonic-gate 12797c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1 || lgrp_topo_ht_limit() <= 2) 12807c478bd9Sstevel@tonic-gate return; 12817c478bd9Sstevel@tonic-gate 12827c478bd9Sstevel@tonic-gate /* 12837c478bd9Sstevel@tonic-gate * Determine ID of node containing current CPU 12847c478bd9Sstevel@tonic-gate */ 12857c478bd9Sstevel@tonic-gate from = LGRP_PLAT_CPU_TO_NODE(CPU); 12867c478bd9Sstevel@tonic-gate 12877c478bd9Sstevel@tonic-gate /* 12887c478bd9Sstevel@tonic-gate * Don't need to probe if got times already 12897c478bd9Sstevel@tonic-gate */ 12907c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[from][from] != 0) 12917c478bd9Sstevel@tonic-gate return; 12927c478bd9Sstevel@tonic-gate 12937c478bd9Sstevel@tonic-gate /* 12947c478bd9Sstevel@tonic-gate * Read vendor ID in Northbridge or read and write page(s) 12957c478bd9Sstevel@tonic-gate * in each node from current CPU and remember how long it takes, 12967c478bd9Sstevel@tonic-gate * so we can build latency topology of machine later. 12977c478bd9Sstevel@tonic-gate * This should approximate the memory latency between each node. 12987c478bd9Sstevel@tonic-gate */ 12997c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_probe_nrounds; i++) 13007c478bd9Sstevel@tonic-gate for (to = 0; to < lgrp_plat_node_cnt; to++) { 13017c478bd9Sstevel@tonic-gate /* 13027c478bd9Sstevel@tonic-gate * Get probe time and bail out if can't get it yet 13037c478bd9Sstevel@tonic-gate */ 13047c478bd9Sstevel@tonic-gate probe_time = lgrp_plat_probe_time(to); 13057c478bd9Sstevel@tonic-gate if (probe_time == 0) 13067c478bd9Sstevel@tonic-gate return; 13077c478bd9Sstevel@tonic-gate 13087c478bd9Sstevel@tonic-gate /* 13097c478bd9Sstevel@tonic-gate * Keep lowest probe time as latency between nodes 13107c478bd9Sstevel@tonic-gate */ 13117c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[from][to] == 0 || 13127c478bd9Sstevel@tonic-gate probe_time < lgrp_plat_probe_times[from][to]) 13137c478bd9Sstevel@tonic-gate lgrp_plat_probe_times[from][to] = probe_time; 13147c478bd9Sstevel@tonic-gate 13157c478bd9Sstevel@tonic-gate /* 13167c478bd9Sstevel@tonic-gate * Update overall minimum and maximum probe times 13177c478bd9Sstevel@tonic-gate * across all nodes 13187c478bd9Sstevel@tonic-gate */ 13197c478bd9Sstevel@tonic-gate if (probe_time < lgrp_plat_probe_time_min || 13207c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_min == -1) 13217c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_min = probe_time; 13227c478bd9Sstevel@tonic-gate if (probe_time > lgrp_plat_probe_time_max) 13237c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max = probe_time; 13247c478bd9Sstevel@tonic-gate } 13257c478bd9Sstevel@tonic-gate 13267c478bd9Sstevel@tonic-gate /* 13277c478bd9Sstevel@tonic-gate * - Fix up latencies such that local latencies are same, 13287c478bd9Sstevel@tonic-gate * latency(i, j) == latency(j, i), etc. (if possible) 13297c478bd9Sstevel@tonic-gate * 13307c478bd9Sstevel@tonic-gate * - Verify that latencies look ok 13317c478bd9Sstevel@tonic-gate * 13327c478bd9Sstevel@tonic-gate * - Fallback to just optimizing for local and remote if 13337c478bd9Sstevel@tonic-gate * latencies didn't look right 13347c478bd9Sstevel@tonic-gate */ 13357c478bd9Sstevel@tonic-gate lgrp_plat_latency_adjust(); 13367c478bd9Sstevel@tonic-gate lgrp_plat_probe_error_code = lgrp_plat_latency_verify(); 13377c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_error_code) 13387c478bd9Sstevel@tonic-gate lgrp_plat_2level_setup(); 13397c478bd9Sstevel@tonic-gate } 13407c478bd9Sstevel@tonic-gate 13417c478bd9Sstevel@tonic-gate 13427c478bd9Sstevel@tonic-gate /* 13437c478bd9Sstevel@tonic-gate * Platform-specific initialization 13447c478bd9Sstevel@tonic-gate */ 13457c478bd9Sstevel@tonic-gate void 13467c478bd9Sstevel@tonic-gate lgrp_plat_main_init(void) 13477c478bd9Sstevel@tonic-gate { 13487c478bd9Sstevel@tonic-gate int curnode; 13497c478bd9Sstevel@tonic-gate int ht_limit; 13507c478bd9Sstevel@tonic-gate int i; 13517c478bd9Sstevel@tonic-gate 13527c478bd9Sstevel@tonic-gate /* 13537c478bd9Sstevel@tonic-gate * Print a notice that MPO is disabled when memory is interleaved 13547c478bd9Sstevel@tonic-gate * across nodes....Would do this when it is discovered, but can't 13557c478bd9Sstevel@tonic-gate * because it happens way too early during boot.... 13567c478bd9Sstevel@tonic-gate */ 13577c478bd9Sstevel@tonic-gate if (lgrp_plat_mem_intrlv) 13587c478bd9Sstevel@tonic-gate cmn_err(CE_NOTE, 13597c478bd9Sstevel@tonic-gate "MPO disabled because memory is interleaved\n"); 13607c478bd9Sstevel@tonic-gate 13617c478bd9Sstevel@tonic-gate /* 13627c478bd9Sstevel@tonic-gate * Don't bother to do any probing if there is only one node or the 13637c478bd9Sstevel@tonic-gate * height of the lgroup topology less than or equal to 2 13647c478bd9Sstevel@tonic-gate */ 13657c478bd9Sstevel@tonic-gate ht_limit = lgrp_topo_ht_limit(); 13667c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1 || ht_limit <= 2) { 13677c478bd9Sstevel@tonic-gate /* 13687c478bd9Sstevel@tonic-gate * Setup lgroup latencies for 2 level lgroup topology 13697c478bd9Sstevel@tonic-gate * (ie. local and remote only) if they haven't been set yet 13707c478bd9Sstevel@tonic-gate */ 13717c478bd9Sstevel@tonic-gate if (ht_limit == 2 && lgrp_plat_probe_time_min == -1 && 13727c478bd9Sstevel@tonic-gate lgrp_plat_probe_time_max == 0) 13737c478bd9Sstevel@tonic-gate lgrp_plat_2level_setup(); 13747c478bd9Sstevel@tonic-gate return; 13757c478bd9Sstevel@tonic-gate } 13767c478bd9Sstevel@tonic-gate 13777c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_op == LGRP_PLAT_PROBE_VENDOR) { 13787c478bd9Sstevel@tonic-gate /* 13797c478bd9Sstevel@tonic-gate * Should have been able to probe from CPU 0 when it was added 13807c478bd9Sstevel@tonic-gate * to lgroup hierarchy, but may not have been able to then 13817c478bd9Sstevel@tonic-gate * because it happens so early in boot that gethrtime() hasn't 13827c478bd9Sstevel@tonic-gate * been initialized. (:-( 13837c478bd9Sstevel@tonic-gate */ 13847c478bd9Sstevel@tonic-gate curnode = LGRP_PLAT_CPU_TO_NODE(CPU); 13857c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[curnode][curnode] == 0) 13867c478bd9Sstevel@tonic-gate lgrp_plat_probe(); 13877c478bd9Sstevel@tonic-gate 13887c478bd9Sstevel@tonic-gate return; 13897c478bd9Sstevel@tonic-gate } 13907c478bd9Sstevel@tonic-gate 13917c478bd9Sstevel@tonic-gate /* 13927c478bd9Sstevel@tonic-gate * When probing memory, use one page for every sample to determine 13937c478bd9Sstevel@tonic-gate * lgroup topology and taking multiple samples 13947c478bd9Sstevel@tonic-gate */ 13957c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_memsize == 0) 13967c478bd9Sstevel@tonic-gate lgrp_plat_probe_memsize = PAGESIZE * 13977c478bd9Sstevel@tonic-gate lgrp_plat_probe_nsamples; 13987c478bd9Sstevel@tonic-gate 13997c478bd9Sstevel@tonic-gate /* 14007c478bd9Sstevel@tonic-gate * Map memory in each node needed for probing to determine latency 14017c478bd9Sstevel@tonic-gate * topology 14027c478bd9Sstevel@tonic-gate */ 14037c478bd9Sstevel@tonic-gate for (i = 0; i < lgrp_plat_node_cnt; i++) { 14047c478bd9Sstevel@tonic-gate int mnode; 14057c478bd9Sstevel@tonic-gate 14067c478bd9Sstevel@tonic-gate /* 14077c478bd9Sstevel@tonic-gate * Skip this node and leave its probe page NULL 14087c478bd9Sstevel@tonic-gate * if it doesn't have any memory 14097c478bd9Sstevel@tonic-gate */ 14107c478bd9Sstevel@tonic-gate mnode = plat_lgrphand_to_mem_node((lgrp_handle_t)i); 14117c478bd9Sstevel@tonic-gate if (!mem_node_config[mnode].exists) { 14127c478bd9Sstevel@tonic-gate lgrp_plat_probe_memory[i] = NULL; 14137c478bd9Sstevel@tonic-gate continue; 14147c478bd9Sstevel@tonic-gate } 14157c478bd9Sstevel@tonic-gate 14167c478bd9Sstevel@tonic-gate /* 14177c478bd9Sstevel@tonic-gate * Allocate one kernel virtual page 14187c478bd9Sstevel@tonic-gate */ 14197c478bd9Sstevel@tonic-gate lgrp_plat_probe_memory[i] = vmem_alloc(heap_arena, 14207c478bd9Sstevel@tonic-gate lgrp_plat_probe_memsize, VM_NOSLEEP); 14217c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_memory[i] == NULL) { 14227c478bd9Sstevel@tonic-gate cmn_err(CE_WARN, 14237c478bd9Sstevel@tonic-gate "lgrp_plat_main_init: couldn't allocate memory"); 14247c478bd9Sstevel@tonic-gate return; 14257c478bd9Sstevel@tonic-gate } 14267c478bd9Sstevel@tonic-gate 14277c478bd9Sstevel@tonic-gate /* 14287c478bd9Sstevel@tonic-gate * Map virtual page to first page in node 14297c478bd9Sstevel@tonic-gate */ 14307c478bd9Sstevel@tonic-gate hat_devload(kas.a_hat, lgrp_plat_probe_memory[i], 14317c478bd9Sstevel@tonic-gate lgrp_plat_probe_memsize, 14327c478bd9Sstevel@tonic-gate lgrp_plat_probe_pfn[i], 14337c478bd9Sstevel@tonic-gate PROT_READ | PROT_WRITE | HAT_PLAT_NOCACHE, 14347c478bd9Sstevel@tonic-gate HAT_LOAD_NOCONSIST); 14357c478bd9Sstevel@tonic-gate } 14367c478bd9Sstevel@tonic-gate 14377c478bd9Sstevel@tonic-gate /* 14387c478bd9Sstevel@tonic-gate * Probe from current CPU 14397c478bd9Sstevel@tonic-gate */ 14407c478bd9Sstevel@tonic-gate lgrp_plat_probe(); 14417c478bd9Sstevel@tonic-gate } 14427c478bd9Sstevel@tonic-gate 14437c478bd9Sstevel@tonic-gate /* 14447c478bd9Sstevel@tonic-gate * Allocate additional space for an lgroup. 14457c478bd9Sstevel@tonic-gate */ 14467c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14477c478bd9Sstevel@tonic-gate lgrp_t * 14487c478bd9Sstevel@tonic-gate lgrp_plat_alloc(lgrp_id_t lgrpid) 14497c478bd9Sstevel@tonic-gate { 14507c478bd9Sstevel@tonic-gate lgrp_t *lgrp; 14517c478bd9Sstevel@tonic-gate 14527c478bd9Sstevel@tonic-gate lgrp = &lgrp_space[nlgrps_alloc++]; 14537c478bd9Sstevel@tonic-gate if (lgrpid >= NLGRP || nlgrps_alloc > NLGRP) 14547c478bd9Sstevel@tonic-gate return (NULL); 14557c478bd9Sstevel@tonic-gate return (lgrp); 14567c478bd9Sstevel@tonic-gate } 14577c478bd9Sstevel@tonic-gate 14587c478bd9Sstevel@tonic-gate /* 14597c478bd9Sstevel@tonic-gate * Platform handling for (re)configuration changes 14607c478bd9Sstevel@tonic-gate */ 14617c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14627c478bd9Sstevel@tonic-gate void 14637c478bd9Sstevel@tonic-gate lgrp_plat_config(lgrp_config_flag_t flag, uintptr_t arg) 14647c478bd9Sstevel@tonic-gate { 14657c478bd9Sstevel@tonic-gate } 14667c478bd9Sstevel@tonic-gate 14677c478bd9Sstevel@tonic-gate /* 14687c478bd9Sstevel@tonic-gate * Return the platform handle for the lgroup containing the given CPU 14697c478bd9Sstevel@tonic-gate */ 14707c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14717c478bd9Sstevel@tonic-gate lgrp_handle_t 14727c478bd9Sstevel@tonic-gate lgrp_plat_cpu_to_hand(processorid_t id) 14737c478bd9Sstevel@tonic-gate { 14747c478bd9Sstevel@tonic-gate if (lgrp_plat_node_cnt == 1) 14757c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 14767c478bd9Sstevel@tonic-gate 14777c478bd9Sstevel@tonic-gate return ((lgrp_handle_t)LGRP_PLAT_CPU_TO_NODE(cpu[id])); 14787c478bd9Sstevel@tonic-gate } 14797c478bd9Sstevel@tonic-gate 14807c478bd9Sstevel@tonic-gate /* 14817c478bd9Sstevel@tonic-gate * Return the platform handle of the lgroup that contains the physical memory 14827c478bd9Sstevel@tonic-gate * corresponding to the given page frame number 14837c478bd9Sstevel@tonic-gate */ 14847c478bd9Sstevel@tonic-gate /* ARGSUSED */ 14857c478bd9Sstevel@tonic-gate lgrp_handle_t 14867c478bd9Sstevel@tonic-gate lgrp_plat_pfn_to_hand(pfn_t pfn) 14877c478bd9Sstevel@tonic-gate { 14887c478bd9Sstevel@tonic-gate int mnode; 14897c478bd9Sstevel@tonic-gate 14907c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 14917c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 14927c478bd9Sstevel@tonic-gate 1493c39996a7Sstevel if (pfn > physmax) 1494c39996a7Sstevel return (LGRP_NULL_HANDLE); 1495c39996a7Sstevel 14967c478bd9Sstevel@tonic-gate mnode = plat_pfn_to_mem_node(pfn); 1497c39996a7Sstevel if (mnode < 0) 1498c39996a7Sstevel return (LGRP_NULL_HANDLE); 1499c39996a7Sstevel 15007c478bd9Sstevel@tonic-gate return (MEM_NODE_2_LGRPHAND(mnode)); 15017c478bd9Sstevel@tonic-gate } 15027c478bd9Sstevel@tonic-gate 15037c478bd9Sstevel@tonic-gate /* 15047c478bd9Sstevel@tonic-gate * Return the maximum number of lgrps supported by the platform. 15057c478bd9Sstevel@tonic-gate * Before lgrp topology is known it returns an estimate based on the number of 15067c478bd9Sstevel@tonic-gate * nodes. Once topology is known it returns the actual maximim number of lgrps 15077c478bd9Sstevel@tonic-gate * created. Since x86 doesn't support dynamic addition of new nodes, this number 15087c478bd9Sstevel@tonic-gate * may not grow during system lifetime. 15097c478bd9Sstevel@tonic-gate */ 15107c478bd9Sstevel@tonic-gate int 15117c478bd9Sstevel@tonic-gate lgrp_plat_max_lgrps() 15127c478bd9Sstevel@tonic-gate { 15137c478bd9Sstevel@tonic-gate return (lgrp_topo_initialized ? 15147c478bd9Sstevel@tonic-gate lgrp_alloc_max + 1 : 15157c478bd9Sstevel@tonic-gate lgrp_plat_node_cnt * (lgrp_plat_node_cnt - 1) + 1); 15167c478bd9Sstevel@tonic-gate } 15177c478bd9Sstevel@tonic-gate 15187c478bd9Sstevel@tonic-gate /* 15197c478bd9Sstevel@tonic-gate * Return the number of free, allocatable, or installed 15207c478bd9Sstevel@tonic-gate * pages in an lgroup 15217c478bd9Sstevel@tonic-gate * This is a copy of the MAX_MEM_NODES == 1 version of the routine 15227c478bd9Sstevel@tonic-gate * used when MPO is disabled (i.e. single lgroup) or this is the root lgroup 15237c478bd9Sstevel@tonic-gate */ 15247c478bd9Sstevel@tonic-gate /* ARGSUSED */ 15257c478bd9Sstevel@tonic-gate static pgcnt_t 15267c478bd9Sstevel@tonic-gate lgrp_plat_mem_size_default(lgrp_handle_t lgrphand, lgrp_mem_query_t query) 15277c478bd9Sstevel@tonic-gate { 15287c478bd9Sstevel@tonic-gate struct memlist *mlist; 15297c478bd9Sstevel@tonic-gate pgcnt_t npgs = 0; 15307c478bd9Sstevel@tonic-gate extern struct memlist *phys_avail; 15317c478bd9Sstevel@tonic-gate extern struct memlist *phys_install; 15327c478bd9Sstevel@tonic-gate 15337c478bd9Sstevel@tonic-gate switch (query) { 15347c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_FREE: 15357c478bd9Sstevel@tonic-gate return ((pgcnt_t)freemem); 15367c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_AVAIL: 15377c478bd9Sstevel@tonic-gate memlist_read_lock(); 15387c478bd9Sstevel@tonic-gate for (mlist = phys_avail; mlist; mlist = mlist->next) 15397c478bd9Sstevel@tonic-gate npgs += btop(mlist->size); 15407c478bd9Sstevel@tonic-gate memlist_read_unlock(); 15417c478bd9Sstevel@tonic-gate return (npgs); 15427c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_INSTALL: 15437c478bd9Sstevel@tonic-gate memlist_read_lock(); 15447c478bd9Sstevel@tonic-gate for (mlist = phys_install; mlist; mlist = mlist->next) 15457c478bd9Sstevel@tonic-gate npgs += btop(mlist->size); 15467c478bd9Sstevel@tonic-gate memlist_read_unlock(); 15477c478bd9Sstevel@tonic-gate return (npgs); 15487c478bd9Sstevel@tonic-gate default: 15497c478bd9Sstevel@tonic-gate return ((pgcnt_t)0); 15507c478bd9Sstevel@tonic-gate } 15517c478bd9Sstevel@tonic-gate } 15527c478bd9Sstevel@tonic-gate 15537c478bd9Sstevel@tonic-gate /* 15547c478bd9Sstevel@tonic-gate * Return the number of free pages in an lgroup. 15557c478bd9Sstevel@tonic-gate * 15567c478bd9Sstevel@tonic-gate * For query of LGRP_MEM_SIZE_FREE, return the number of base pagesize 15577c478bd9Sstevel@tonic-gate * pages on freelists. For query of LGRP_MEM_SIZE_AVAIL, return the 15587c478bd9Sstevel@tonic-gate * number of allocatable base pagesize pages corresponding to the 15597c478bd9Sstevel@tonic-gate * lgroup (e.g. do not include page_t's, BOP_ALLOC()'ed memory, ..) 15607c478bd9Sstevel@tonic-gate * For query of LGRP_MEM_SIZE_INSTALL, return the amount of physical 15617c478bd9Sstevel@tonic-gate * memory installed, regardless of whether or not it's usable. 15627c478bd9Sstevel@tonic-gate */ 15637c478bd9Sstevel@tonic-gate pgcnt_t 15647c478bd9Sstevel@tonic-gate lgrp_plat_mem_size(lgrp_handle_t plathand, lgrp_mem_query_t query) 15657c478bd9Sstevel@tonic-gate { 15667c478bd9Sstevel@tonic-gate int mnode; 15677c478bd9Sstevel@tonic-gate pgcnt_t npgs = (pgcnt_t)0; 15687c478bd9Sstevel@tonic-gate extern struct memlist *phys_avail; 15697c478bd9Sstevel@tonic-gate extern struct memlist *phys_install; 15707c478bd9Sstevel@tonic-gate 15717c478bd9Sstevel@tonic-gate 15727c478bd9Sstevel@tonic-gate if (plathand == LGRP_DEFAULT_HANDLE) 15737c478bd9Sstevel@tonic-gate return (lgrp_plat_mem_size_default(plathand, query)); 15747c478bd9Sstevel@tonic-gate 15757c478bd9Sstevel@tonic-gate if (plathand != LGRP_NULL_HANDLE) { 15767c478bd9Sstevel@tonic-gate mnode = plat_lgrphand_to_mem_node(plathand); 15777c478bd9Sstevel@tonic-gate if (mnode >= 0 && mem_node_config[mnode].exists) { 15787c478bd9Sstevel@tonic-gate switch (query) { 15797c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_FREE: 1580affbd3ccSkchow npgs = MNODE_PGCNT(mnode); 15817c478bd9Sstevel@tonic-gate break; 15827c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_AVAIL: 15837c478bd9Sstevel@tonic-gate npgs = mem_node_memlist_pages(mnode, 15847c478bd9Sstevel@tonic-gate phys_avail); 15857c478bd9Sstevel@tonic-gate break; 15867c478bd9Sstevel@tonic-gate case LGRP_MEM_SIZE_INSTALL: 15877c478bd9Sstevel@tonic-gate npgs = mem_node_memlist_pages(mnode, 15887c478bd9Sstevel@tonic-gate phys_install); 15897c478bd9Sstevel@tonic-gate break; 15907c478bd9Sstevel@tonic-gate default: 15917c478bd9Sstevel@tonic-gate break; 15927c478bd9Sstevel@tonic-gate } 15937c478bd9Sstevel@tonic-gate } 15947c478bd9Sstevel@tonic-gate } 15957c478bd9Sstevel@tonic-gate return (npgs); 15967c478bd9Sstevel@tonic-gate } 15977c478bd9Sstevel@tonic-gate 15987c478bd9Sstevel@tonic-gate /* 15997c478bd9Sstevel@tonic-gate * Return latency between "from" and "to" lgroups 16007c478bd9Sstevel@tonic-gate * 16017c478bd9Sstevel@tonic-gate * This latency number can only be used for relative comparison 16027c478bd9Sstevel@tonic-gate * between lgroups on the running system, cannot be used across platforms, 16037c478bd9Sstevel@tonic-gate * and may not reflect the actual latency. It is platform and implementation 16047c478bd9Sstevel@tonic-gate * specific, so platform gets to decide its value. It would be nice if the 16057c478bd9Sstevel@tonic-gate * number was at least proportional to make comparisons more meaningful though. 16067c478bd9Sstevel@tonic-gate */ 16077c478bd9Sstevel@tonic-gate /* ARGSUSED */ 16087c478bd9Sstevel@tonic-gate int 16097c478bd9Sstevel@tonic-gate lgrp_plat_latency(lgrp_handle_t from, lgrp_handle_t to) 16107c478bd9Sstevel@tonic-gate { 16117c478bd9Sstevel@tonic-gate lgrp_handle_t src, dest; 16127c478bd9Sstevel@tonic-gate 16137c478bd9Sstevel@tonic-gate if (max_mem_nodes == 1) 16147c478bd9Sstevel@tonic-gate return (0); 16157c478bd9Sstevel@tonic-gate 16167c478bd9Sstevel@tonic-gate /* 16177c478bd9Sstevel@tonic-gate * Return max latency for root lgroup 16187c478bd9Sstevel@tonic-gate */ 16197c478bd9Sstevel@tonic-gate if (from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE) 16207c478bd9Sstevel@tonic-gate return (lgrp_plat_probe_time_max); 16217c478bd9Sstevel@tonic-gate 16227c478bd9Sstevel@tonic-gate src = from; 16237c478bd9Sstevel@tonic-gate dest = to; 16247c478bd9Sstevel@tonic-gate 16257c478bd9Sstevel@tonic-gate /* 16267c478bd9Sstevel@tonic-gate * Return 0 for nodes (lgroup platform handles) out of range 16277c478bd9Sstevel@tonic-gate */ 16287c478bd9Sstevel@tonic-gate if (src < 0 || src >= MAX_NODES || dest < 0 || dest >= MAX_NODES) 16297c478bd9Sstevel@tonic-gate return (0); 16307c478bd9Sstevel@tonic-gate 16317c478bd9Sstevel@tonic-gate /* 16327c478bd9Sstevel@tonic-gate * Probe from current CPU if its lgroup latencies haven't been set yet 16337c478bd9Sstevel@tonic-gate * and we are trying to get latency from current CPU to some node 16347c478bd9Sstevel@tonic-gate */ 16357c478bd9Sstevel@tonic-gate if (lgrp_plat_probe_times[src][src] == 0 && 16367c478bd9Sstevel@tonic-gate LGRP_PLAT_CPU_TO_NODE(CPU) == src) 16377c478bd9Sstevel@tonic-gate lgrp_plat_probe(); 16387c478bd9Sstevel@tonic-gate 16397c478bd9Sstevel@tonic-gate return (lgrp_plat_probe_times[src][dest]); 16407c478bd9Sstevel@tonic-gate } 16417c478bd9Sstevel@tonic-gate 16427c478bd9Sstevel@tonic-gate /* 16437c478bd9Sstevel@tonic-gate * Return platform handle for root lgroup 16447c478bd9Sstevel@tonic-gate */ 16457c478bd9Sstevel@tonic-gate lgrp_handle_t 16467c478bd9Sstevel@tonic-gate lgrp_plat_root_hand(void) 16477c478bd9Sstevel@tonic-gate { 16487c478bd9Sstevel@tonic-gate return (LGRP_DEFAULT_HANDLE); 16497c478bd9Sstevel@tonic-gate } 1650