/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */
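/*
 * Worked example of the membership test above (hypothetical values, for
 * illustration only): suppose every lgroup has mask 0xC0000000 (bits 30
 * and 31, contiguous), one lgroup has match 0x40000000, and an mblock
 * has ra_to_pa = 0x40000000.  Then for ra = 0x10000000:
 *
 *	(0x10000000 + 0x40000000) & 0xC0000000 == 0x40000000
 *
 * so that ra is a member of that lgroup.  Shifting the match value right
 * to drop the mask's low-order 0 bits (30 of them here) gives the
 * lgrp_handle_t: 0x40000000 >> 30 == 1.
 */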
int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int	n_lgrpnodes = 0;
static	int	n_locality_groups = 0;
static	int	max_locality_groups = 0;

/* Save mblocks from the MD */
#define	SMALL_MBLOCKS_COUNT	8
static	struct mblock_md *mpo_mblock;
static	struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static	int	n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static	mem_stripe_t *mem_stripes;
static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static	int	mstripesz = 0;
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...)	if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) {						     \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status);					     \
}

/*
 * Routine to read a uint64_t from a given md
 */
static int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

static void
mpo_update_tunables(void)
{
	int i, ncpu_min;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */
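	/*
	 * Example with illustrative numbers only: if the smallest lgroup
	 * has 8 CPUs, the threshold computed below is
	 * 8 * lgrp_loadavg_max_effect / 2, i.e. roughly the load signature
	 * of 4 continuously running threads (assuming
	 * lgrp_loadavg_max_effect is the per-thread maximum load
	 * contribution), which is the 50% point described above.
	 */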
	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
}

static mde_cookie_t
cpuid_to_cpunode(md_t *md, int cpuid)
{
	mde_cookie_t rootnode, foundnode, *cpunodes;
	uint64_t cpuid_prop;
	int n_cpunodes, i;

	if (md == NULL)
		return (MDE_INVAL_ELEM_COOKIE);

	rootnode = md_root_node(md);
	if (rootnode == MDE_INVAL_ELEM_COOKIE)
		return (MDE_INVAL_ELEM_COOKIE);

	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
	    "fwd", &cpunodes);
	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
		goto cpuid_fail;

	for (i = 0; i < n_cpunodes; i++) {
		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
		    &cpuid_prop))
			break;
		if (cpuid_prop == (uint64_t)cpuid) {
			foundnode = cpunodes[i];
			md_free_scan_dag(md, &cpunodes);
			return (foundnode);
		}
	}
cpuid_fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	return (MDE_INVAL_ELEM_COOKIE);
}

static int
mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
{
	mde_cookie_t *nodes;
	uint64_t latency, lowest_latency;
	uint64_t address_match, lowest_address_match;
	int n_lgroups, j, result = 0;

	/* Find lgroup nodes reachable from this cpu */
	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
	    "fwd", &nodes);

	lowest_latency = ~(0UL);

	/* Find the lgroup node with the smallest latency */
	for (j = 0; j < n_lgroups; j++) {
		result = get_int(md, nodes[j], PROP_LG_LATENCY,
		    &latency);
		result |= get_int(md, nodes[j], PROP_LG_MATCH,
		    &address_match);
		if (result != 0) {
			j = -1;
			goto to_lgrp_done;
		}
		if (latency < lowest_latency) {
			lowest_latency = latency;
			lowest_address_match = address_match;
		}
	}
	for (j = 0; j < n_lgrpnodes; j++) {
		if ((mpo_lgroup[j].latency == lowest_latency) &&
		    (mpo_lgroup[j].addr_match == lowest_address_match))
			break;
	}
	if (j == n_lgrpnodes)
		j = -1;

to_lgrp_done:
	if (n_lgroups > 0)
		md_free_scan_dag(md, &nodes);
	return (j);
}

/* Called when DR'ing in a CPU */
void
mpo_cpu_add(int cpuid)
{
	md_t *md;
	mde_cookie_t cpunode;

	int i;

	if (n_lgrpnodes <= 0)
		return;

	md = md_get_handle();

	if (md == NULL)
		goto add_fail;

	cpunode = cpuid_to_cpunode(md, cpuid);
	if (cpunode == MDE_INVAL_ELEM_COOKIE)
		goto add_fail;

	i = mpo_cpu_to_lgroup(md, cpunode);
	if (i == -1)
		goto add_fail;

	mpo_cpu[cpuid].lgrp_index = i;
	mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift;
	mpo_lgroup[i].ncpu++;
	mpo_update_tunables();
	(void) md_fini_handle(md);
	return;
add_fail:
	panic("mpo_cpu_add: Cannot read MD");
}

/* Called when DR'ing out a CPU */
void
mpo_cpu_remove(int cpuid)
{
	int i;

	if (n_lgrpnodes <= 0)
		return;

	i = mpo_cpu[cpuid].lgrp_index;
	mpo_lgroup[i].ncpu--;
	mpo_cpu[cpuid].home = 0;
	mpo_cpu[cpuid].lgrp_index = -1;
	mpo_update_tunables();
}

/*
 * Traverse the MD to determine:
 *
 *	Number of CPU nodes, lgrp_nodes, and mblocks
 *	Then for each lgrp_node, obtain the appropriate data.
 *	For each CPU, determine its home locality and store it.
 *	For each mblock, retrieve its data and store it.
 */
static int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;
	int mblocksz = 0;
	size_t allocsz;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}
	/*
	 * If we have a small number of mblocks we will use the space
	 * that we preallocated.  Otherwise, we will dynamically
	 * allocate the space.
	 */
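	/*
	 * For example (illustrative numbers only): a domain with up to
	 * SMALL_MBLOCKS_COUNT (8) mblocks uses the static arrays declared
	 * above, while one with, say, 64 mblocks instead receives a
	 * page-aligned chunk of the reserved MPOBUF range via the
	 * prom_alloc() call below.
	 */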
	mblocksz = n_mblocks * sizeof (struct mblock_md);
	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);

	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
		mpo_mblock = &small_mpo_mblocks[0];
		mem_stripes = &small_mem_stripes[0];
	} else {
		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
		/* Ensure that we don't request more space than reserved */
		if (allocsz > MPOBUF_SIZE) {
			MPO_STATUS("lgrp_traverse: Insufficient space "
			    "for mblock structures \n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_mblock = (struct mblock_md *)
		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
			MPO_STATUS("lgrp_traverse: Cannot allocate space "
			    "for mblocks \n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
		mpo_heap32_bufsz = MPOBUF_SIZE;

		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
	}
	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
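	/*
	 * Worked example (hypothetical mask, for illustration): with
	 * home_mask = 0xC0000000 and 8K pages (PAGESHIFT = 13), the
	 * assignments above give home_mask_shift = 30,
	 * home_mask_pfn = 0x60000, home_mask_pfn_shift = 17, and
	 * mnode_pages = btop(1ULL << 30) = 0x20000 pages per stripe;
	 * the computation below then yields max_locality_groups = 4.
	 */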
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
	 */
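	/*
	 * For example, an MD latency of 500000 ps (500 ns) becomes
	 * 500000 / 10000 = 50 in the framework's units after the
	 * divisions below.
	 */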
	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].lgrp_index = -1;
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */
		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
		if (o == -1) {
			ret_val = -1;
			goto fail;
		}
		mpo_cpu[k].lgrp_index = o;
		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
		mpo_lgroup[o].ncpu++;
	}
	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */
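	/*
	 * Example (assuming MEM_LG_ISMEMBER(set, id) tests whether bit
	 * id is set): a homeset of 0x5 has home ids 0 and 2 present, so
	 * the loop below returns a count of 2.
	 */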
	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int rc;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;

	mpo_update_tunables();
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset: The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	  mblock 0        mblock 1
 */
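/*
 * Worked index example (hypothetical sizes): with
 * max_locality_groups = 4, the stripe describing mnode 2 within
 * mpo_mblock[3] lives at mem_stripes[2 + 3 * 4] == mem_stripes[14],
 * per the indexing formula above.
 */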
void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, mstripesz);
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].mnode_mask = (mnodeset_t)0;
		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);
			mpo_mblock[i].mnode_mask |= (mnodeset_t)1 << mnode;

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
	}
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
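/*
 * Example (hypothetical numbers): an mblock with ra_to_pa = 0x40000000
 * contributes btop(0x40000000) = 0x20000 with 8K pages, so RA pfn 0x1000
 * converts to PA pfn 0x21000; only the color and home mask bits of the
 * result are guaranteed meaningful, as noted above.
 */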
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration,
 *	the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
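/*
 * Usage sketch (hypothetical caller, for illustration only): to walk
 * mnode m starting at pfn, call plat_mem_node_iterator_init(pfn, m,
 * &it, 1), consume pfns up through it.mi_mblock_end, then call again
 * with init = 0 and a pfn past that bound to advance to the next
 * mblock, stopping when (pfn_t)-1 is returned.
 */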
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if ((mpo_mblock[i].mnode_mask & ((mnodeset_t)1 << mnode)) &&
		    (pfn <= mpo_mblock[i].end_pfn))
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
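/*
 * Example geometry (hypothetical): with mnode_pages = 0x100 and
 * mnode_stride = 0x400, each stripe of a given mnode contributes
 * 0x100 pages per stride and the hole between its stripes is
 * 0x400 - 0x100 = 0x300 pages, so a range spanning two whole strides
 * intersects in 2 * 0x100 pages plus any leading/trailing fragments.
 */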
1193ce8eb11aSdp78419 */ 1194ce8eb11aSdp78419 void 1195ce8eb11aSdp78419 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len, 1196ce8eb11aSdp78419 int mnode, pgcnt_t *npages_out) 1197ce8eb11aSdp78419 { 1198ce8eb11aSdp78419 pfn_t offset, len, hole, base, end, test_end, frag; 1199ce8eb11aSdp78419 pfn_t nearest; 1200ce8eb11aSdp78419 mem_stripe_t *ms; 1201ce8eb11aSdp78419 int i, npages; 1202ce8eb11aSdp78419 1203ce8eb11aSdp78419 *npages_out = 0; 1204ce8eb11aSdp78419 1205ce8eb11aSdp78419 if (!mem_node_config[mnode].exists || test_len == 0) 1206ce8eb11aSdp78419 return; 1207ce8eb11aSdp78419 1208ce8eb11aSdp78419 base = mem_node_config[mnode].physbase; 1209ce8eb11aSdp78419 end = mem_node_config[mnode].physmax; 1210ce8eb11aSdp78419 1211ce8eb11aSdp78419 test_end = test_base + test_len - 1; 1212ce8eb11aSdp78419 if (end < test_base || base > test_end) 1213ce8eb11aSdp78419 return; 1214ce8eb11aSdp78419 1215ce8eb11aSdp78419 if (n_locality_groups == 1) { 1216ce8eb11aSdp78419 *npages_out = MIN(test_end, end) - MAX(test_base, base) + 1; 1217ce8eb11aSdp78419 return; 1218ce8eb11aSdp78419 } 1219ce8eb11aSdp78419 1220ce8eb11aSdp78419 hole = mnode_stride - mnode_pages; 1221ce8eb11aSdp78419 npages = 0; 1222ce8eb11aSdp78419 1223ce8eb11aSdp78419 /* 1224ce8eb11aSdp78419 * Iterate over all the stripes for this mnode (one per mblock), 1225ce8eb11aSdp78419 * find the intersection with each, and accumulate the intersections. 1226ce8eb11aSdp78419 * 1227ce8eb11aSdp78419 * Determing the intersection with a stripe is tricky. If base or end 1228ce8eb11aSdp78419 * fall outside the mem_node bounds, round them to physbase/physmax of 1229ce8eb11aSdp78419 * mem_node. If base or end fall in a gap, round them to start of 1230ce8eb11aSdp78419 * nearest stripe. If they fall within a stripe, keep base or end, 1231ce8eb11aSdp78419 * but calculate the fragment size that should be excluded from the 1232ce8eb11aSdp78419 * stripe. Calculate how many strides fall in the adjusted range, 1233ce8eb11aSdp78419 * multiply by stripe width, and add the start and end fragments. 1234ce8eb11aSdp78419 */ 1235ce8eb11aSdp78419 1236ce8eb11aSdp78419 for (i = mnode; i < n_mem_stripes; i += max_locality_groups) { 1237ce8eb11aSdp78419 ms = &mem_stripes[i]; 1238ce8eb11aSdp78419 if (ms->exists && 1239ce8eb11aSdp78419 test_base <= (end = ms->physmax) && 1240ce8eb11aSdp78419 test_end >= (base = ms->physbase)) { 1241ce8eb11aSdp78419 1242ce8eb11aSdp78419 offset = ms->offset; 1243ce8eb11aSdp78419 1244ce8eb11aSdp78419 if (test_base > base) { 1245ce8eb11aSdp78419 /* Round test_base to next multiple of stride */ 1246ce8eb11aSdp78419 len = P2ROUNDUP(test_base - (base - offset), 1247ce8eb11aSdp78419 mnode_stride); 1248ce8eb11aSdp78419 nearest = base - offset + len; 1249ce8eb11aSdp78419 /* 1250ce8eb11aSdp78419 * Compute distance from test_base to the 1251ce8eb11aSdp78419 * stride boundary to see if test_base falls 1252ce8eb11aSdp78419 * in the stripe or in the hole. 1253ce8eb11aSdp78419 */ 1254ce8eb11aSdp78419 if (nearest - test_base > hole) { 1255ce8eb11aSdp78419 /* 1256ce8eb11aSdp78419 * test_base lies in stripe, 1257ce8eb11aSdp78419 * and offset should be excluded. 
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}
			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
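/*
 * Example with a hypothetical configuration (illustration only): with two
 * lgroups and a 4 GB mnode stride on 8 KB pages, home_mask_pfn would be
 * 0x80000 and home_mask_pfn_shift would be 19, so MNODE(0x100000000) ==
 * ((0x100000000 >> 13) & 0x80000) >> 19 == 1, while MNODE(0) == 0.
 */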
static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page size supported by the cpu
	 * (per the MD) and the largest page size usable here (256M).
	 * mmu_exported_pagesize_mask is not yet initialized, so read the
	 * list from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */
	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If the mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but the mblock
		 * must not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find the start of the last large page in the mblock in RA
		 * space.  If that page extends into the next mblock, verify
		 * that the mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}
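/*
 * A minimal sketch (mpo_example_ra2pa_aligned is a hypothetical helper,
 * illustrative only and not called by the kernel): the RA = PA mod MAXPAGE
 * check in valid_pages() passes exactly when an mblock's ra_to_pa offset
 * is itself a multiple of max_coalesce_len, since for the power-of-two
 * mask m = max_coalesce_len - 1, (base & m) == ((base + ra_to_pa) & m)
 * iff ra_to_pa mod max_coalesce_len == 0.
 */
static int
mpo_example_ra2pa_aligned(struct mblock_md *mb, uint64_t max_coalesce_len)
{
	/* nonzero iff RA and PA agree modulo the coalesce size */
	return ((mb->ra_to_pa & (max_coalesce_len - 1)) == 0);
}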
/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine-grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */
static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}
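/*
 * Worked example with hypothetical values (illustration only): given three
 * lgroup nodes with addr_mask values 0x600000000, 0x1000, and 0x600000000
 * on 8 KB pages (PAGEOFFSET = 0x1fff), the middle node has mask bits below
 * the page size (0x1000 & 0x1fff != 0), so the loop above compacts it out
 * of mpo_lgroup[] in place, leaving n_lgrpnodes == 2 and returning 1.
 */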