1ce8eb11aSdp78419 /* 2ce8eb11aSdp78419 * CDDL HEADER START 3ce8eb11aSdp78419 * 4ce8eb11aSdp78419 * The contents of this file are subject to the terms of the 5ce8eb11aSdp78419 * Common Development and Distribution License (the "License"). 6ce8eb11aSdp78419 * You may not use this file except in compliance with the License. 7ce8eb11aSdp78419 * 8ce8eb11aSdp78419 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9ce8eb11aSdp78419 * or http://www.opensolaris.org/os/licensing. 10ce8eb11aSdp78419 * See the License for the specific language governing permissions 11ce8eb11aSdp78419 * and limitations under the License. 12ce8eb11aSdp78419 * 13ce8eb11aSdp78419 * When distributing Covered Code, include this CDDL HEADER in each 14ce8eb11aSdp78419 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15ce8eb11aSdp78419 * If applicable, add the following below this CDDL HEADER, with the 16ce8eb11aSdp78419 * fields enclosed by brackets "[]" replaced with your own identifying 17ce8eb11aSdp78419 * information: Portions Copyright [yyyy] [name of copyright owner] 18ce8eb11aSdp78419 * 19ce8eb11aSdp78419 * CDDL HEADER END 20ce8eb11aSdp78419 */ 21ce8eb11aSdp78419 22ce8eb11aSdp78419 /* 23ce8eb11aSdp78419 * Copyright 2007 Sun Microsystems, Inc. All rights reserved. 24ce8eb11aSdp78419 * Use is subject to license terms. 
25ce8eb11aSdp78419 */ 26ce8eb11aSdp78419 27ce8eb11aSdp78419 #pragma ident "%Z%%M% %I% %E% SMI" 28ce8eb11aSdp78419 29ce8eb11aSdp78419 #include <sys/types.h> 30ce8eb11aSdp78419 #include <sys/sysmacros.h> 31ce8eb11aSdp78419 #include <sys/machsystm.h> 32ce8eb11aSdp78419 #include <sys/machparam.h> 33ce8eb11aSdp78419 #include <sys/cmn_err.h> 34ce8eb11aSdp78419 #include <sys/stat.h> 35ce8eb11aSdp78419 #include <sys/mach_descrip.h> 36ce8eb11aSdp78419 #include <sys/memnode.h> 37ce8eb11aSdp78419 #include <sys/mdesc.h> 38ce8eb11aSdp78419 #include <sys/mpo.h> 39ce8eb11aSdp78419 #include <vm/vm_dep.h> 40*e853d8c3Sjc25722 #include <vm/hat_sfmmu.h> 41ce8eb11aSdp78419 42ce8eb11aSdp78419 /* 43ce8eb11aSdp78419 * MPO and the sun4v memory representation 44ce8eb11aSdp78419 * --------------------------------------- 45ce8eb11aSdp78419 * 46ce8eb11aSdp78419 * Latency groups are defined in the sun4v achitecture by memory-latency-group 47ce8eb11aSdp78419 * nodes in the Machine Description, as specified in FWARC/2007/260. These 48ce8eb11aSdp78419 * tie together cpu nodes and mblock nodes, and contain mask and match 49ce8eb11aSdp78419 * properties that identify the portion of an mblock that belongs to the 50ce8eb11aSdp78419 * lgroup. Mask and match are defined in the Physical Address (PA) space, 51ce8eb11aSdp78419 * but an mblock defines Real Addresses (RA). To translate, the mblock 52ce8eb11aSdp78419 * includes the property address-congruence-offset, hereafter referred to as 53ce8eb11aSdp78419 * ra_to_pa. A real address ra is a member of an lgroup if 54ce8eb11aSdp78419 * 55ce8eb11aSdp78419 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match 56ce8eb11aSdp78419 * 57ce8eb11aSdp78419 * The MD is traversed, and information on all mblocks is kept in the array 58ce8eb11aSdp78419 * mpo_mblock[]. Information on all CPUs, including which lgroup they map 59ce8eb11aSdp78419 * to, is kept in the array mpo_cpu[]. 
60ce8eb11aSdp78419 * 61ce8eb11aSdp78419 * This implementation makes (and verifies) the simplifying assumption that 62ce8eb11aSdp78419 * the mask bits are the same for all defined lgroups, and that all 1 bits in 63ce8eb11aSdp78419 * the mask are contiguous. Thus the number of lgroups is bounded by the 64ce8eb11aSdp78419 * number of possible mask values, and the lgrp_handle_t is defined as the 65ce8eb11aSdp78419 * mask value, shifted right to eliminate the 0 bit positions in mask. The 66ce8eb11aSdp78419 * masks and values are also referred to as "home bits" in the code. 67ce8eb11aSdp78419 * 68ce8eb11aSdp78419 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup 69ce8eb11aSdp78419 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock 70ce8eb11aSdp78419 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the 71ce8eb11aSdp78419 * home bits. This yields the mem_node. 72ce8eb11aSdp78419 * 73ce8eb11aSdp78419 * Interfaces 74ce8eb11aSdp78419 * ---------- 75ce8eb11aSdp78419 * 76ce8eb11aSdp78419 * This file exports the following entry points: 77ce8eb11aSdp78419 * 78ce8eb11aSdp78419 * plat_lgrp_init() 79ce8eb11aSdp78419 * plat_build_mem_nodes() 80ce8eb11aSdp78419 * plat_lgrp_cpu_to_hand() 81ce8eb11aSdp78419 * plat_lgrp_latency() 82ce8eb11aSdp78419 * plat_pfn_to_mem_node() 83ce8eb11aSdp78419 * These implement the usual platform lgroup interfaces. 84ce8eb11aSdp78419 * 85ce8eb11aSdp78419 * plat_rapfn_to_papfn() 86ce8eb11aSdp78419 * Recover the PA page coloring bits from an RA. 87ce8eb11aSdp78419 * 88ce8eb11aSdp78419 * plat_mem_node_iterator_init() 89ce8eb11aSdp78419 * Initialize an iterator to efficiently step through pages in a mem_node. 90ce8eb11aSdp78419 * 91ce8eb11aSdp78419 * plat_mem_node_intersect_range() 92ce8eb11aSdp78419 * Find the intersection with a mem_node. 
93ce8eb11aSdp78419 */ 94ce8eb11aSdp78419 95ce8eb11aSdp78419 int sun4v_mpo_enable = 1; 96ce8eb11aSdp78419 int sun4v_mpo_debug = 0; 97ce8eb11aSdp78419 char sun4v_mpo_status[256] = ""; 98ce8eb11aSdp78419 99ce8eb11aSdp78419 /* Save CPU info from the MD and associate CPUs with lgroups */ 100ce8eb11aSdp78419 static struct cpu_md mpo_cpu[NCPU]; 101ce8eb11aSdp78419 102ce8eb11aSdp78419 /* Save lgroup info from the MD */ 103ce8eb11aSdp78419 #define MAX_MD_LGROUPS 32 104ce8eb11aSdp78419 static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS]; 105ce8eb11aSdp78419 static int n_lgrpnodes = 0; 106ce8eb11aSdp78419 static int n_locality_groups = 0; 107ce8eb11aSdp78419 static int max_locality_groups = 0; 108ce8eb11aSdp78419 109ce8eb11aSdp78419 /* Save mblocks from the MD */ 110ce8eb11aSdp78419 static struct mblock_md mpo_mblock[MPO_MAX_MBLOCKS]; 111ce8eb11aSdp78419 static int n_mblocks = 0; 112ce8eb11aSdp78419 113ce8eb11aSdp78419 /* Save mem_node stripes calculate from mblocks and lgroups. */ 114ce8eb11aSdp78419 static mem_stripe_t mem_stripes[MAX_MEM_STRIPES]; 115ce8eb11aSdp78419 static int n_mem_stripes = 0; 116ce8eb11aSdp78419 static pfn_t mnode_stride; /* distance between stripes, start to start */ 117ce8eb11aSdp78419 static int stripe_shift; /* stride/stripes expressed as a shift */ 118ce8eb11aSdp78419 static pfn_t mnode_pages; /* mem_node stripe width */ 119ce8eb11aSdp78419 120ce8eb11aSdp78419 /* Save home mask and shift used to calculate lgrp_handle_t values */ 121ce8eb11aSdp78419 static uint64_t home_mask = 0; 122ce8eb11aSdp78419 static pfn_t home_mask_pfn = 0; 123ce8eb11aSdp78419 static int home_mask_shift = 0; 124ce8eb11aSdp78419 static uint_t home_mask_pfn_shift = 0; 125ce8eb11aSdp78419 126ce8eb11aSdp78419 /* Save lowest and highest latencies found across all lgroups */ 127ce8eb11aSdp78419 static int lower_latency = 0; 128ce8eb11aSdp78419 static int higher_latency = 0; 129ce8eb11aSdp78419 130ce8eb11aSdp78419 static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock 
memory */ 131ce8eb11aSdp78419 132ce8eb11aSdp78419 static int valid_pages(md_t *md, mde_cookie_t cpu0); 133ce8eb11aSdp78419 static int unique_home_mem_lg_count(uint64_t mem_lg_homeset); 134ce8eb11aSdp78419 static int fix_interleave(void); 135ce8eb11aSdp78419 136ce8eb11aSdp78419 /* Debug support */ 137ce8eb11aSdp78419 #if defined(DEBUG) && !defined(lint) 138ce8eb11aSdp78419 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args) 139ce8eb11aSdp78419 #else 140ce8eb11aSdp78419 #define MPO_DEBUG(...) 141ce8eb11aSdp78419 #endif /* DEBUG */ 142ce8eb11aSdp78419 143ce8eb11aSdp78419 /* Record status message, viewable from mdb */ 144ce8eb11aSdp78419 #define MPO_STATUS(args...) { \ 145ce8eb11aSdp78419 (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \ 146ce8eb11aSdp78419 MPO_DEBUG(sun4v_mpo_status); \ 147ce8eb11aSdp78419 } 148ce8eb11aSdp78419 149ce8eb11aSdp78419 /* 150ce8eb11aSdp78419 * Routine to read a uint64_t from a given md 151ce8eb11aSdp78419 */ 152ce8eb11aSdp78419 static int64_t 153ce8eb11aSdp78419 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val) 154ce8eb11aSdp78419 { 155ce8eb11aSdp78419 int err = md_get_prop_val(md, node, propname, val); 156ce8eb11aSdp78419 return (err); 157ce8eb11aSdp78419 } 158ce8eb11aSdp78419 159ce8eb11aSdp78419 static int 160ce8eb11aSdp78419 mblock_cmp(const void *a, const void *b) 161ce8eb11aSdp78419 { 162ce8eb11aSdp78419 struct mblock_md *m1 = (struct mblock_md *)a; 163ce8eb11aSdp78419 struct mblock_md *m2 = (struct mblock_md *)b; 164ce8eb11aSdp78419 165ce8eb11aSdp78419 if (m1->base < m2->base) 166ce8eb11aSdp78419 return (-1); 167ce8eb11aSdp78419 else if (m1->base == m2->base) 168ce8eb11aSdp78419 return (0); 169ce8eb11aSdp78419 else 170ce8eb11aSdp78419 return (1); 171ce8eb11aSdp78419 } 172ce8eb11aSdp78419 173ce8eb11aSdp78419 static void 174ce8eb11aSdp78419 mblock_sort(struct mblock_md *mblocks, int n) 175ce8eb11aSdp78419 { 176ce8eb11aSdp78419 extern void qsort(void *, size_t, size_t, 177ce8eb11aSdp78419 
int (*)(const void *, const void *)); 178ce8eb11aSdp78419 179ce8eb11aSdp78419 qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp); 180ce8eb11aSdp78419 } 181ce8eb11aSdp78419 182ce8eb11aSdp78419 /* 183ce8eb11aSdp78419 * 184ce8eb11aSdp78419 * Traverse the MD to determine: 185ce8eb11aSdp78419 * 186ce8eb11aSdp78419 * Number of CPU nodes, lgrp_nodes, and mblocks 187ce8eb11aSdp78419 * Then for each lgrp_node, obtain the appropriate data. 188ce8eb11aSdp78419 * For each CPU, determine its home locality and store it. 189ce8eb11aSdp78419 * For each mblock, retrieve its data and store it. 190ce8eb11aSdp78419 */ 191ce8eb11aSdp78419 static int 192ce8eb11aSdp78419 lgrp_traverse(md_t *md) 193ce8eb11aSdp78419 { 194ce8eb11aSdp78419 mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes; 195ce8eb11aSdp78419 uint64_t i, j, k, o, n_nodes; 196ce8eb11aSdp78419 uint64_t n_lgroups = 0; 197ce8eb11aSdp78419 uint64_t mem_lg_homeset = 0; 198ce8eb11aSdp78419 int ret_val = 0; 199ce8eb11aSdp78419 int result = 0; 200ce8eb11aSdp78419 int n_cpunodes = 0; 201ce8eb11aSdp78419 int sub_page_fix; 202ce8eb11aSdp78419 203ce8eb11aSdp78419 n_nodes = md_node_count(md); 204ce8eb11aSdp78419 205ce8eb11aSdp78419 if (n_nodes <= 0) { 206ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: No nodes in node count\n"); 207ce8eb11aSdp78419 ret_val = -1; 208ce8eb11aSdp78419 goto fail; 209ce8eb11aSdp78419 } 210ce8eb11aSdp78419 211ce8eb11aSdp78419 root = md_root_node(md); 212ce8eb11aSdp78419 213ce8eb11aSdp78419 if (root == MDE_INVAL_ELEM_COOKIE) { 214ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: Root node is missing\n"); 215ce8eb11aSdp78419 ret_val = -1; 216ce8eb11aSdp78419 goto fail; 217ce8eb11aSdp78419 } 218ce8eb11aSdp78419 219ce8eb11aSdp78419 /* 220ce8eb11aSdp78419 * Build the Memory Nodes. Do this before any possibility of 221ce8eb11aSdp78419 * bailing from this routine so we obtain ra_to_pa (needed for page 222ce8eb11aSdp78419 * coloring) even when there are no lgroups defined. 
223ce8eb11aSdp78419 */ 224ce8eb11aSdp78419 225ce8eb11aSdp78419 n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, 226ce8eb11aSdp78419 "fwd", &mblocknodes); 227ce8eb11aSdp78419 228ce8eb11aSdp78419 if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) { 229ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: No mblock " 230ce8eb11aSdp78419 "nodes detected in Machine Descriptor\n"); 231ce8eb11aSdp78419 n_mblocks = 0; 232ce8eb11aSdp78419 ret_val = -1; 233ce8eb11aSdp78419 goto fail; 234ce8eb11aSdp78419 } 235ce8eb11aSdp78419 236ce8eb11aSdp78419 for (i = 0; i < n_mblocks; i++) { 237ce8eb11aSdp78419 mpo_mblock[i].node = mblocknodes[i]; 238ce8eb11aSdp78419 239ce8eb11aSdp78419 /* Without a base or size value we will fail */ 240ce8eb11aSdp78419 result = get_int(md, mblocknodes[i], PROP_LG_BASE, 241ce8eb11aSdp78419 &mpo_mblock[i].base); 242ce8eb11aSdp78419 if (result < 0) { 243ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: " 244ce8eb11aSdp78419 "PROP_LG_BASE is missing\n"); 245ce8eb11aSdp78419 n_mblocks = 0; 246ce8eb11aSdp78419 ret_val = -1; 247ce8eb11aSdp78419 goto fail; 248ce8eb11aSdp78419 } 249ce8eb11aSdp78419 250ce8eb11aSdp78419 result = get_int(md, mblocknodes[i], PROP_LG_SIZE, 251ce8eb11aSdp78419 &mpo_mblock[i].size); 252ce8eb11aSdp78419 if (result < 0) { 253ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: " 254ce8eb11aSdp78419 "PROP_LG_SIZE is missing\n"); 255ce8eb11aSdp78419 n_mblocks = 0; 256ce8eb11aSdp78419 ret_val = -1; 257ce8eb11aSdp78419 goto fail; 258ce8eb11aSdp78419 } 259ce8eb11aSdp78419 260ce8eb11aSdp78419 result = get_int(md, mblocknodes[i], 261ce8eb11aSdp78419 PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa); 262ce8eb11aSdp78419 263ce8eb11aSdp78419 /* If we don't have an ra_pa_offset, just set it to 0 */ 264ce8eb11aSdp78419 if (result < 0) 265ce8eb11aSdp78419 mpo_mblock[i].ra_to_pa = 0; 266ce8eb11aSdp78419 267ce8eb11aSdp78419 MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, " 268ce8eb11aSdp78419 "ra_to_pa = %lx\n", i, 269ce8eb11aSdp78419 mpo_mblock[i].base, 270ce8eb11aSdp78419 
mpo_mblock[i].size, 271ce8eb11aSdp78419 mpo_mblock[i].ra_to_pa); 272ce8eb11aSdp78419 } 273ce8eb11aSdp78419 274ce8eb11aSdp78419 /* Must sort mblocks by address for mem_node_iterator_init() */ 275ce8eb11aSdp78419 mblock_sort(mpo_mblock, n_mblocks); 276ce8eb11aSdp78419 277ce8eb11aSdp78419 base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa); 278ce8eb11aSdp78419 279ce8eb11aSdp78419 /* Page coloring hook is required so we can iterate through mnodes */ 280ce8eb11aSdp78419 if (&page_next_pfn_for_color_cpu == NULL) { 281ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: No page coloring support\n"); 282ce8eb11aSdp78419 ret_val = -1; 283ce8eb11aSdp78419 goto fail; 284ce8eb11aSdp78419 } 285ce8eb11aSdp78419 286ce8eb11aSdp78419 /* Global enable for mpo */ 287ce8eb11aSdp78419 if (sun4v_mpo_enable == 0) { 288ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n"); 289ce8eb11aSdp78419 ret_val = -1; 290ce8eb11aSdp78419 goto fail; 291ce8eb11aSdp78419 } 292ce8eb11aSdp78419 293ce8eb11aSdp78419 n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG, 294ce8eb11aSdp78419 "fwd", &lgrpnodes); 295ce8eb11aSdp78419 296ce8eb11aSdp78419 if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) { 297ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: No Lgroups\n"); 298ce8eb11aSdp78419 ret_val = -1; 299ce8eb11aSdp78419 goto fail; 300ce8eb11aSdp78419 } 301ce8eb11aSdp78419 302ce8eb11aSdp78419 n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes); 303ce8eb11aSdp78419 304ce8eb11aSdp78419 if (n_cpunodes <= 0 || n_cpunodes > NCPU) { 305ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: No CPU nodes detected " 306ce8eb11aSdp78419 "in MD\n"); 307ce8eb11aSdp78419 ret_val = -1; 308ce8eb11aSdp78419 goto fail; 309ce8eb11aSdp78419 } 310ce8eb11aSdp78419 311ce8eb11aSdp78419 MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes); 312ce8eb11aSdp78419 MPO_DEBUG("lgrp_traverse: md: %p\n", md); 313ce8eb11aSdp78419 MPO_DEBUG("lgrp_traverse: root: %lx\n", root); 314ce8eb11aSdp78419 
MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes); 315ce8eb11aSdp78419 MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes); 316ce8eb11aSdp78419 MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks); 317ce8eb11aSdp78419 318ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 319ce8eb11aSdp78419 mpo_lgroup[i].node = lgrpnodes[i]; 320ce8eb11aSdp78419 mpo_lgroup[i].id = i; 321ce8eb11aSdp78419 mpo_lgroup[i].ncpu = 0; 322ce8eb11aSdp78419 result = get_int(md, lgrpnodes[i], PROP_LG_MASK, 323ce8eb11aSdp78419 &mpo_lgroup[i].addr_mask); 324ce8eb11aSdp78419 result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH, 325ce8eb11aSdp78419 &mpo_lgroup[i].addr_match); 326ce8eb11aSdp78419 327ce8eb11aSdp78419 /* 328ce8eb11aSdp78419 * If either the mask or match properties are missing, set to 0 329ce8eb11aSdp78419 */ 330ce8eb11aSdp78419 if (result < 0) { 331ce8eb11aSdp78419 mpo_lgroup[i].addr_mask = 0; 332ce8eb11aSdp78419 mpo_lgroup[i].addr_match = 0; 333ce8eb11aSdp78419 } 334ce8eb11aSdp78419 335ce8eb11aSdp78419 /* Set latency to 0 if property not present */ 336ce8eb11aSdp78419 337ce8eb11aSdp78419 result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY, 338ce8eb11aSdp78419 &mpo_lgroup[i].latency); 339ce8eb11aSdp78419 if (result < 0) 340ce8eb11aSdp78419 mpo_lgroup[i].latency = 0; 341ce8eb11aSdp78419 } 342ce8eb11aSdp78419 343ce8eb11aSdp78419 /* 344ce8eb11aSdp78419 * Sub-page level interleave is not yet supported. Check for it, 345ce8eb11aSdp78419 * and remove sub-page interleaved lgroups from mpo_lgroup and 346ce8eb11aSdp78419 * n_lgrpnodes. If no lgroups are left, return. 
347ce8eb11aSdp78419 */ 348ce8eb11aSdp78419 349ce8eb11aSdp78419 sub_page_fix = fix_interleave(); 350ce8eb11aSdp78419 if (n_lgrpnodes == 0) { 351ce8eb11aSdp78419 ret_val = -1; 352ce8eb11aSdp78419 goto fail; 353ce8eb11aSdp78419 } 354ce8eb11aSdp78419 355ce8eb11aSdp78419 /* Ensure that all of the addr_mask values are the same */ 356ce8eb11aSdp78419 357ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 358ce8eb11aSdp78419 if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) { 359ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: " 360ce8eb11aSdp78419 "addr_mask values are not the same\n"); 361ce8eb11aSdp78419 ret_val = -1; 362ce8eb11aSdp78419 goto fail; 363ce8eb11aSdp78419 } 364ce8eb11aSdp78419 } 365ce8eb11aSdp78419 366ce8eb11aSdp78419 /* 367ce8eb11aSdp78419 * Ensure that all lgrp nodes see all the mblocks. However, if 368ce8eb11aSdp78419 * sub-page interleave is being fixed, they do not, so skip 369ce8eb11aSdp78419 * the check. 370ce8eb11aSdp78419 */ 371ce8eb11aSdp78419 372ce8eb11aSdp78419 if (sub_page_fix == 0) { 373ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 374ce8eb11aSdp78419 j = md_alloc_scan_dag(md, mpo_lgroup[i].node, 375ce8eb11aSdp78419 PROP_LG_MBLOCK, "fwd", &nodes); 376ce8eb11aSdp78419 md_free_scan_dag(md, &nodes); 377ce8eb11aSdp78419 if (j != n_mblocks) { 378ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: " 379ce8eb11aSdp78419 "sub-page interleave is being fixed\n"); 380ce8eb11aSdp78419 ret_val = -1; 381ce8eb11aSdp78419 goto fail; 382ce8eb11aSdp78419 } 383ce8eb11aSdp78419 } 384ce8eb11aSdp78419 } 385ce8eb11aSdp78419 386ce8eb11aSdp78419 /* 387ce8eb11aSdp78419 * Use the address mask from the first lgroup node 388ce8eb11aSdp78419 * to establish our home_mask. 
389ce8eb11aSdp78419 */ 390ce8eb11aSdp78419 home_mask = mpo_lgroup[0].addr_mask; 391ce8eb11aSdp78419 home_mask_pfn = btop(home_mask); 392ce8eb11aSdp78419 home_mask_shift = lowbit(home_mask) - 1; 393ce8eb11aSdp78419 home_mask_pfn_shift = home_mask_shift - PAGESHIFT; 394ce8eb11aSdp78419 mnode_pages = btop(1ULL << home_mask_shift); 395ce8eb11aSdp78419 396ce8eb11aSdp78419 /* 397ce8eb11aSdp78419 * How many values are possible in home mask? Assume the mask 398ce8eb11aSdp78419 * bits are contiguous. 399ce8eb11aSdp78419 */ 400ce8eb11aSdp78419 max_locality_groups = 401ce8eb11aSdp78419 1 << highbit(home_mask_pfn >> home_mask_pfn_shift); 402ce8eb11aSdp78419 403ce8eb11aSdp78419 /* Now verify the home mask bits are contiguous */ 404ce8eb11aSdp78419 405ce8eb11aSdp78419 if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) { 406ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: " 407ce8eb11aSdp78419 "home mask bits are not contiguous\n"); 408ce8eb11aSdp78419 ret_val = -1; 409ce8eb11aSdp78419 goto fail; 410ce8eb11aSdp78419 } 411ce8eb11aSdp78419 412ce8eb11aSdp78419 /* Record all of the home bits */ 413ce8eb11aSdp78419 414ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 415ce8eb11aSdp78419 HOMESET_ADD(mem_lg_homeset, 416ce8eb11aSdp78419 mpo_lgroup[i].addr_match >> home_mask_shift); 417ce8eb11aSdp78419 } 418ce8eb11aSdp78419 419ce8eb11aSdp78419 /* Count the number different "home" mem_lg's we've discovered */ 420ce8eb11aSdp78419 421ce8eb11aSdp78419 n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset); 422ce8eb11aSdp78419 423ce8eb11aSdp78419 /* If we have only 1 locality group then we can exit */ 424ce8eb11aSdp78419 if (n_locality_groups == 1) { 425ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n"); 426ce8eb11aSdp78419 ret_val = -1; 427ce8eb11aSdp78419 goto fail; 428ce8eb11aSdp78419 } 429ce8eb11aSdp78419 430ce8eb11aSdp78419 /* 431ce8eb11aSdp78419 * Set the latencies. A CPU's lgroup is defined by the lowest 432ce8eb11aSdp78419 * latency found. 
All other memory is considered remote, and the 433ce8eb11aSdp78419 * remote latency is represented by the highest latency found. 434ce8eb11aSdp78419 * Thus hierarchical lgroups, if any, are approximated by a 435ce8eb11aSdp78419 * two level scheme. 436ce8eb11aSdp78419 * 437ce8eb11aSdp78419 * The Solaris MPO framework by convention wants to see latencies 438ce8eb11aSdp78419 * in units of nano-sec/10. In the MD, the units are defined to be 439ce8eb11aSdp78419 * pico-seconds. 440ce8eb11aSdp78419 */ 441ce8eb11aSdp78419 442ce8eb11aSdp78419 lower_latency = mpo_lgroup[0].latency; 443ce8eb11aSdp78419 higher_latency = mpo_lgroup[0].latency; 444ce8eb11aSdp78419 445ce8eb11aSdp78419 for (i = 1; i < n_lgrpnodes; i++) { 446ce8eb11aSdp78419 if (mpo_lgroup[i].latency < lower_latency) { 447ce8eb11aSdp78419 lower_latency = mpo_lgroup[i].latency; 448ce8eb11aSdp78419 } 449ce8eb11aSdp78419 if (mpo_lgroup[i].latency > higher_latency) { 450ce8eb11aSdp78419 higher_latency = mpo_lgroup[i].latency; 451ce8eb11aSdp78419 } 452ce8eb11aSdp78419 } 453ce8eb11aSdp78419 lower_latency /= 10000; 454ce8eb11aSdp78419 higher_latency /= 10000; 455ce8eb11aSdp78419 456ce8eb11aSdp78419 /* Clear our CPU data */ 457ce8eb11aSdp78419 458ce8eb11aSdp78419 for (i = 0; i < NCPU; i++) { 459ce8eb11aSdp78419 mpo_cpu[i].home = 0; 460ce8eb11aSdp78419 mpo_cpu[i].latency = (uint_t)(-1); 461ce8eb11aSdp78419 } 462ce8eb11aSdp78419 463ce8eb11aSdp78419 /* Build the CPU nodes */ 464ce8eb11aSdp78419 for (i = 0; i < n_cpunodes; i++) { 465ce8eb11aSdp78419 466ce8eb11aSdp78419 /* Read in the lgroup nodes */ 467ce8eb11aSdp78419 468ce8eb11aSdp78419 result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k); 469ce8eb11aSdp78419 if (result < 0) { 470ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n"); 471ce8eb11aSdp78419 ret_val = -1; 472ce8eb11aSdp78419 goto fail; 473ce8eb11aSdp78419 } 474ce8eb11aSdp78419 475ce8eb11aSdp78419 n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG, 476ce8eb11aSdp78419 "fwd", &nodes); 
477ce8eb11aSdp78419 if (n_lgroups <= 0) { 478ce8eb11aSdp78419 MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing"); 479ce8eb11aSdp78419 ret_val = -1; 480ce8eb11aSdp78419 goto fail; 481ce8eb11aSdp78419 } 482ce8eb11aSdp78419 483ce8eb11aSdp78419 /* 484ce8eb11aSdp78419 * Find the lgroup this cpu belongs to with the lowest latency. 485ce8eb11aSdp78419 * Check all the lgrp nodes connected to this CPU to determine 486ce8eb11aSdp78419 * which has the smallest latency. 487ce8eb11aSdp78419 */ 488ce8eb11aSdp78419 489ce8eb11aSdp78419 for (j = 0; j < n_lgroups; j++) { 490ce8eb11aSdp78419 for (o = 0; o < n_lgrpnodes; o++) { 491ce8eb11aSdp78419 if (nodes[j] == mpo_lgroup[o].node) { 492ce8eb11aSdp78419 if (mpo_lgroup[o].latency < 493ce8eb11aSdp78419 mpo_cpu[k].latency) { 494ce8eb11aSdp78419 mpo_cpu[k].home = 495ce8eb11aSdp78419 mpo_lgroup[o].addr_match 496ce8eb11aSdp78419 >> home_mask_shift; 497ce8eb11aSdp78419 mpo_cpu[k].latency = 498ce8eb11aSdp78419 mpo_lgroup[o].latency; 499ce8eb11aSdp78419 mpo_lgroup[o].ncpu++; 500ce8eb11aSdp78419 } 501ce8eb11aSdp78419 } 502ce8eb11aSdp78419 } 503ce8eb11aSdp78419 } 504ce8eb11aSdp78419 md_free_scan_dag(md, &nodes); 505ce8eb11aSdp78419 } 506ce8eb11aSdp78419 507ce8eb11aSdp78419 /* Validate that no large pages cross mnode boundaries. */ 508ce8eb11aSdp78419 if (valid_pages(md, cpunodes[0]) == 0) { 509ce8eb11aSdp78419 ret_val = -1; 510ce8eb11aSdp78419 goto fail; 511ce8eb11aSdp78419 } 512ce8eb11aSdp78419 513ce8eb11aSdp78419 fail: 514ce8eb11aSdp78419 /* MD cookies are no longer valid; ensure they are not used again. 
*/ 515ce8eb11aSdp78419 for (i = 0; i < n_mblocks; i++) 516ce8eb11aSdp78419 mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE; 517ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) 518ce8eb11aSdp78419 mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE; 519ce8eb11aSdp78419 520ce8eb11aSdp78419 if (n_cpunodes > 0) 521ce8eb11aSdp78419 md_free_scan_dag(md, &cpunodes); 522ce8eb11aSdp78419 if (n_lgrpnodes > 0) 523ce8eb11aSdp78419 md_free_scan_dag(md, &lgrpnodes); 524ce8eb11aSdp78419 if (n_mblocks > 0) 525ce8eb11aSdp78419 md_free_scan_dag(md, &mblocknodes); 526ce8eb11aSdp78419 else 527ce8eb11aSdp78419 panic("lgrp_traverse: No memory blocks found"); 528ce8eb11aSdp78419 529ce8eb11aSdp78419 if (ret_val == 0) 530ce8eb11aSdp78419 MPO_STATUS("MPO feature is enabled.\n"); 531ce8eb11aSdp78419 532ce8eb11aSdp78419 return (ret_val); 533ce8eb11aSdp78419 } 534ce8eb11aSdp78419 535ce8eb11aSdp78419 /* 536ce8eb11aSdp78419 * Determine the number of unique mem_lg's present in our system 537ce8eb11aSdp78419 */ 538ce8eb11aSdp78419 static int 539ce8eb11aSdp78419 unique_home_mem_lg_count(uint64_t mem_lg_homeset) 540ce8eb11aSdp78419 { 541ce8eb11aSdp78419 int homeid; 542ce8eb11aSdp78419 int count = 0; 543ce8eb11aSdp78419 544ce8eb11aSdp78419 /* 545ce8eb11aSdp78419 * Scan the "home" bits of the mem_lgs, count 546ce8eb11aSdp78419 * the number that are unique. 
547ce8eb11aSdp78419 */ 548ce8eb11aSdp78419 549ce8eb11aSdp78419 for (homeid = 0; homeid < NLGRPS_MAX; homeid++) { 550ce8eb11aSdp78419 if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) { 551ce8eb11aSdp78419 count++; 552ce8eb11aSdp78419 } 553ce8eb11aSdp78419 } 554ce8eb11aSdp78419 555ce8eb11aSdp78419 MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n", 556ce8eb11aSdp78419 mem_lg_homeset); 557ce8eb11aSdp78419 MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count); 558ce8eb11aSdp78419 559ce8eb11aSdp78419 /* Default must be at least one */ 560ce8eb11aSdp78419 if (count == 0) 561ce8eb11aSdp78419 count = 1; 562ce8eb11aSdp78419 563ce8eb11aSdp78419 return (count); 564ce8eb11aSdp78419 } 565ce8eb11aSdp78419 566ce8eb11aSdp78419 /* 567ce8eb11aSdp78419 * Platform specific lgroup initialization 568ce8eb11aSdp78419 */ 569ce8eb11aSdp78419 void 570ce8eb11aSdp78419 plat_lgrp_init(void) 571ce8eb11aSdp78419 { 572ce8eb11aSdp78419 md_t *md; 573ce8eb11aSdp78419 int i, rc, ncpu_min; 574ce8eb11aSdp78419 575ce8eb11aSdp78419 /* Get the Machine Descriptor handle */ 576ce8eb11aSdp78419 577ce8eb11aSdp78419 md = md_get_handle(); 578ce8eb11aSdp78419 579ce8eb11aSdp78419 /* If not, we cannot continue */ 580ce8eb11aSdp78419 581ce8eb11aSdp78419 if (md == NULL) { 582ce8eb11aSdp78419 panic("cannot access machine descriptor\n"); 583ce8eb11aSdp78419 } else { 584ce8eb11aSdp78419 rc = lgrp_traverse(md); 585ce8eb11aSdp78419 (void) md_fini_handle(md); 586ce8eb11aSdp78419 } 587ce8eb11aSdp78419 588ce8eb11aSdp78419 /* 589ce8eb11aSdp78419 * If we can't process the MD for lgroups then at least let the 590ce8eb11aSdp78419 * system try to boot. Assume we have one lgroup so that 591ce8eb11aSdp78419 * when plat_build_mem_nodes is called, it will attempt to init 592ce8eb11aSdp78419 * an mnode based on the supplied memory segment. 
593ce8eb11aSdp78419 */ 594ce8eb11aSdp78419 595ce8eb11aSdp78419 if (rc == -1) { 596ce8eb11aSdp78419 home_mask_pfn = 0; 597ce8eb11aSdp78419 max_locality_groups = 1; 598ce8eb11aSdp78419 n_locality_groups = 1; 599ce8eb11aSdp78419 return; 600ce8eb11aSdp78419 } 601ce8eb11aSdp78419 602ce8eb11aSdp78419 mem_node_pfn_shift = 0; 603ce8eb11aSdp78419 mem_node_physalign = 0; 604ce8eb11aSdp78419 605ce8eb11aSdp78419 /* Use lgroup-aware TSB allocations */ 606ce8eb11aSdp78419 tsb_lgrp_affinity = 1; 607ce8eb11aSdp78419 608ce8eb11aSdp78419 /* 609ce8eb11aSdp78419 * lgrp_expand_proc_thresh is the minimum load on the lgroups 610ce8eb11aSdp78419 * this process is currently running on before considering 611ce8eb11aSdp78419 * expanding threads to another lgroup. 612ce8eb11aSdp78419 * 613ce8eb11aSdp78419 * lgrp_expand_proc_diff determines how much less the remote lgroup 614ce8eb11aSdp78419 * must be loaded before expanding to it. 615ce8eb11aSdp78419 * 616ce8eb11aSdp78419 * On sun4v CMT processors, threads share a core pipeline, and 617ce8eb11aSdp78419 * at less than 100% utilization, best throughput is obtained by 618ce8eb11aSdp78419 * spreading threads across more cores, even if some are in a 619ce8eb11aSdp78419 * different lgroup. Spread threads to a new lgroup if the 620ce8eb11aSdp78419 * current group is more than 50% loaded. Because of virtualization, 621ce8eb11aSdp78419 * lgroups may have different numbers of CPUs, but the tunables 622ce8eb11aSdp78419 * apply to all lgroups, so find the smallest lgroup and compute 623ce8eb11aSdp78419 * 50% loading. 
 */

	/*
	 * Tail of lgroup-tuning setup (function header is above this chunk).
	 * Derive load-balancing thresholds from the smallest non-empty
	 * lgroup so that migration decisions scale with lgroup size.
	 */
	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;
}

/*
 * Helper routine for debugging calls to mem_node_add_slice().
 * On DEBUG kernels it logs a running count of slice additions before
 * delegating; on non-DEBUG kernels it is a plain pass-through.
 */
static void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node().
 * NOTE(review): the two adjacent string literals in the MPO_DEBUG format
 * concatenate without a separating space ("...%ld,""mnode..."), so the
 * debug output reads "home 5,mnode index: 0" -- cosmetic only; confirm
 * before changing the message text.
 */
static
void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset: The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	mblock 0		mblock 1
 */

void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		/* list[] alternates (base, len) pairs; one slice per pair */
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		/*
		 * With a single mblock we are completely done; with several
		 * mblocks we fall through to record per-mblock stripe info
		 * (with exactly one lgroup) below.
		 */
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, sizeof (mem_stripes));
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		/* Home lgroup is extracted from the PA via the home mask. */
		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			/* Record this mnode's stripe within mblock i. */
			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			/* Advance to the start of the next stripe. */
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor.
 * Falls back to the default (root) handle when MPO is not active.
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
	}
}

/*
 * Return the latency between two lgroup handles, in unspecified units
 * read from the MD earlier (higher_latency/lower_latency globals).
 */
int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

/*
 * Map a RA pfn to its mem_node (mnode) index.  Returns 0 when MPO is
 * not active; panics if the pfn is not covered by any known mblock.
 */
int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);	/* not reached; satisfies the compiler */
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	/* Single-mblock fast path: one global RA->PA delta applies. */
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);	/* not reached; satisfies the compiler */
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration
 *	the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		/* Trivial case: one flat range, no home-bit masking. */
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		/* Clamp pfn into bounds, or signal end of iteration. */
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 * On a continuation (init == 0) resume after the mblock the
	 * previous call stopped in (mi_last_mblock).
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if (pfn <= mpo_mblock[i].end_pfn)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	/* Clamp pfn into the selected bounds, or signal end of iteration. */
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 * The page count of the intersection is returned via npages_out
 * (0 when the mnode does not exist or the ranges are disjoint).
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	/* Non-striped case: simple interval intersection. */
	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */

	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			/* strides * stripe width, minus the edge fragments */
			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

/* Extract the mnode (home) bits from a physical address. */
#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	/* The mnode slice must be able to hold at least one max-size page. */
	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	/* Compact mpo_lgroup[] in place, dropping sub-page-interleaved ones */
	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}