/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * mask value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a
 *	mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */
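/*
 * A worked example of the membership formula above, using hypothetical
 * values rather than any real MD: suppose every lgroup has
 * mask 0x600000000, so the home bits are PA bits 33 and 34, and suppose
 * an mblock has base 0x80000000000 and ra_to_pa 0x200000000.  Then
 * ra 0x80000000000 translates to pa 0x80200000000, and
 * (pa & mask) == 0x200000000, so the ra belongs to the lgroup whose
 * match is 0x200000000, i.e. lgrp_handle_t 0x200000000 >> 33 == 1.
 * With a 2-bit mask there can be at most 1 << 2 == 4 lgroups, with
 * handles 0 through 3.
 */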
int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int	n_lgrpnodes = 0;
static	int	n_locality_groups = 0;
static	int	max_locality_groups = 0;

/* Save mblocks from the MD */
static	struct	mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
static	int	n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static	mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) {						      \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);  \
	MPO_DEBUG(sun4v_mpo_status);					      \
}

/*
 * Routine to read a uint64_t from a given md
 */
static	int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static	int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t n_lgroups = 0;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}
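	/*
	 * Continuing the hypothetical example from the file header: with
	 * home_mask = 0x600000000 and 8K pages (PAGESHIFT == 13),
	 * home_mask_shift is lowbit(0x600000000) - 1 == 33,
	 * home_mask_pfn_shift is 33 - 13 == 20, and mnode_pages is
	 * btop(1ULL << 33) == 0x100000 pages (an 8 GB stripe).  The
	 * shifted mask is 0x300000 >> 20 == 3, so max_locality_groups is
	 * 1 << highbit(3) == 4, and the contiguity check above verifies
	 * that 4 - 1 == 3.
	 */
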
	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].latency = (uint_t)(-1);
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */

		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
		    "fwd", &nodes);
		if (n_lgroups <= 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
			ret_val = -1;
			goto fail;
		}

		/*
		 * Find the lgroup this cpu belongs to with the lowest latency.
		 * Check all the lgrp nodes connected to this CPU to determine
		 * which has the smallest latency.
		 */

		for (j = 0; j < n_lgroups; j++) {
			for (o = 0; o < n_lgrpnodes; o++) {
				if (nodes[j] == mpo_lgroup[o].node) {
					if (mpo_lgroup[o].latency <
					    mpo_cpu[k].latency) {
						mpo_cpu[k].home =
						    mpo_lgroup[o].addr_match
						    >> home_mask_shift;
						mpo_cpu[k].latency =
						    mpo_lgroup[o].latency;
						mpo_lgroup[o].ncpu++;
					}
				}
			}
		}
		md_free_scan_dag(md, &nodes);
	}

	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static	int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int i, rc, ncpu_min;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static	void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	  mblock 0	   mblock 1
 */
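/*
 * Continuing the hypothetical numbers used earlier: with 4 possible
 * mem_nodes and 8 GB stripes, mnode_pages is 0x100000 pfn's and
 * mnode_stride is 4 * 0x100000 == 0x400000 pfn's.  The stripe for
 * mnode 2 within mpo_mblock[1] is mem_stripes[2 + 1 * 4], i.e.
 * mem_stripes[6].  If that mblock's first page belonging to mnode 2
 * falls 0x40000 pfn's into the stripe width, then ms->offset is
 * 0x40000 and the second full stripe begins at
 * ms->physbase - 0x40000 + 0x400000.
 */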
void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Check for non-MPO sun4v platforms */

	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0);
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		return;
	}

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;
	bzero(mem_stripes, sizeof (mem_stripes));
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}
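			/*
			 * Hypothetical example, in bytes for clarity: with
			 * stripe = 8G and stride = 32G, a range whose final
			 * partial stride holds 20G has frag = 20G > stripe,
			 * so remove = 12G and physmax is trimmed back to the
			 * end of this mnode's whole stripe.  A 5G fragment
			 * lies entirely within the stripe and is all kept
			 * (remove = 0), while frag == 0 means the last
			 * stride is complete, so only its trailing 24G gap
			 * is removed.
			 */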
			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode);
			mpo_mem_node_add_slice(ms->physbase, ms->physmax);

			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)0); /* Default */
	}
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different
	 * lgroups or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration,
 *	the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if (pfn <= mpo_mblock[i].end_pfn)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}
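/*
 * A minimal usage sketch of the iterator, with hypothetical caller
 * locals (the page coloring code reached via
 * page_next_pfn_for_color_cpu() is the intended consumer):
 *
 *	mem_node_iterator_t it;
 *	pfn_t pfn;
 *
 *	pfn = plat_mem_node_iterator_init(start, mnode, &it, 1);
 *	while (pfn != (pfn_t)-1) {
 *		... visit pfn's in [pfn, it.mi_mblock_end], stepping so
 *		... that the home bits selected by it.mi_mnode_pfn_mask
 *		... continue to equal mnode ...
 *		pfn = <first candidate pfn past the current mblock>;
 *		pfn = plat_mem_node_iterator_init(pfn, mnode, &it, 0);
 *	}
 */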
/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to
	 * physbase/physmax of mem_node.  If base or end fall in a gap,
	 * round them to start of nearest stripe.  If they fall within a
	 * stripe, keep base or end, but calculate the fragment size that
	 * should be excluded from the stripe.  Calculate how many strides
	 * fall in the adjusted range, multiply by stripe width, and add the
	 * start and end fragments.
	 */
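	/*
	 * Hypothetical example, in pfn's: with mnode_pages = 0x100000 and
	 * mnode_stride = 0x400000 (so hole = 0x300000), a stripe with
	 * offset 0 and physbase 0x1000000, intersected with a test range
	 * starting at 0x1080000, begins in the middle of a stripe; the
	 * 0x80000 pfn's between the stripe start and test_base become the
	 * excluded start fragment (the code below reuses 'offset' for
	 * this).  Had test_base fallen in the following hole instead, it
	 * would be rounded up to the start of the next stripe.
	 */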
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);
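	/*
	 * For example, a hypothetical MD advertising 8K, 64K, and 4M
	 * pages yields szc_mask = (1 << TTE8K) | (1 << TTE64K) |
	 * (1 << TTE4M) == 0xb, so max_szc = highbit(0xb) - 1 == TTE4M
	 * and max_page_len = 4 MB.  A mask advertising sizes above 256M
	 * would be clamped to TTE256M.
	 */
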
	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}