/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>
#include <vm/hat_sfmmu.h>
#include <sys/promif.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.
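 *
 * As an illustrative example (hypothetical values, not taken from any real
 * MD): if every lgroup has mask = 0x600000000 (bits 33-34) and an mblock
 * has ra_to_pa = 0, then ra = 0x240000000 satisfies
 *
 *	(0x240000000 + 0) & 0x600000000 == 0x200000000
 *
 * so that ra belongs to the lgroup whose match value is 0x200000000, and
 * shifting the match right by lowbit(mask) - 1 = 33 yields handle 1.
 *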
 * Thus the number of lgroups is bounded by the number of possible mask
 * values, and the lgrp_handle_t is defined as the mask value, shifted
 * right to eliminate the 0 bit positions in mask.  The masks and values
 * are also referred to as "home bits" in the code.
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a
 *	mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */

int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int	n_lgrpnodes = 0;
static	int	n_locality_groups = 0;
static	int	max_locality_groups = 0;

/* Save mblocks from the MD */
#define	SMALL_MBLOCKS_COUNT	8
static	struct mblock_md *mpo_mblock;
static	struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
static	int	n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static	mem_stripe_t *mem_stripes;
static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
static	int	mstripesz = 0;
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) {						\
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \
	MPO_DEBUG(sun4v_mpo_status);					\
}

/*
 * Routine to read a uint64_t from a given md
 */
static	int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 *
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static	int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t n_lgroups = 0;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;
	int mblocksz = 0;
	size_t allocsz;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}
	/*
	 * If we have a small number of mblocks we will use the space
	 * that we preallocated.  Otherwise, we will dynamically
	 * allocate the space.
	 */
	mblocksz = n_mblocks * sizeof (struct mblock_md);
	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);

	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
		mpo_mblock = &small_mpo_mblocks[0];
		mem_stripes = &small_mem_stripes[0];
	} else {
		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
		/* Ensure that we don't request more space than reserved */
		if (allocsz > MPOBUF_SIZE) {
			MPO_STATUS("lgrp_traverse: Insufficient space "
			    "for mblock structures\n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_mblock = (struct mblock_md *)
		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
			MPO_STATUS("lgrp_traverse: Cannot allocate space "
			    "for mblocks\n");
			ret_val = -1;
			n_mblocks = 0;
			goto fail;
		}
		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
		mpo_heap32_bufsz = MPOBUF_SIZE;

		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
	}

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];
		mpo_mblock[i].mnode_mask = (mnodeset_t)0;

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks.  However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "sub-page interleave is being fixed\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
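	 *
	 * As a hypothetical example (values chosen for illustration only):
	 * home_mask = 0x600000000 gives home_mask_shift = 33; with 8K
	 * pages (PAGESHIFT = 13), home_mask_pfn_shift = 20, so
	 * home_mask_pfn >> home_mask_pfn_shift = 3 and
	 * max_locality_groups = 1 << highbit(3) = 4.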
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10.  In the MD, the units are defined to be
	 * pico-seconds.
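	 * For example, an MD latency of 1,200,000 ps is 1,200 ns, which
	 * the division by 10,000 below represents as 120.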
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].latency = (uint_t)(-1);
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */

		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
		    "fwd", &nodes);
		if (n_lgroups <= 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
			ret_val = -1;
			goto fail;
		}

		/*
		 * Find the lgroup this cpu belongs to with the lowest latency.
		 * Check all the lgrp nodes connected to this CPU to determine
		 * which has the smallest latency.
		 */

		for (j = 0; j < n_lgroups; j++) {
			for (o = 0; o < n_lgrpnodes; o++) {
				if (nodes[j] == mpo_lgroup[o].node) {
					if (mpo_lgroup[o].latency <
					    mpo_cpu[k].latency) {
						mpo_cpu[k].home =
						    mpo_lgroup[o].addr_match
						    >> home_mask_shift;
						mpo_cpu[k].latency =
						    mpo_lgroup[o].latency;
						mpo_lgroup[o].ncpu++;
					}
				}
			}
		}
		md_free_scan_dag(md, &nodes);
	}

	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static	int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int i, rc, ncpu_min;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* Without it, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;
}

/*
 * Helper routine for debugging calls to mem_node_add_slice()
 */
static	void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	  mblock 0		  mblock 1
 */

void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		if (n_mblocks == 1)
			return;
	}

	bzero(mem_stripes, mstripesz);
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);
			mpo_mblock[i].mnode_mask |= (mnodeset_t)1 << mnode;

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
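			 *
			 * As a hypothetical illustration (numbers invented
			 * for clarity): with stripe = 4 units and
			 * stride = 16, a tail of frag = 6 keeps one full
			 * stripe and trims remove = 6 - 4 = 2, while
			 * frag = 0 means the tail is a whole stride, so the
			 * trailing gap of stride - stripe = 12 is removed.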
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			/*
			 * If we have only 1 lgroup and multiple mblocks,
			 * then we have already established our lgrp handle
			 * to mem_node and mem_node_config values above.
			 */
			if (n_locality_groups > 1) {
				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
				    mnode);
				mpo_mem_node_add_slice(ms->physbase,
				    ms->physmax);
			}
			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
	}
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
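	 *
	 * For example, when lgrp_optimizations() is true, latency(h, h)
	 * for a leaf handle h returns lower_latency, while two different
	 * handles, or any query involving LGRP_DEFAULT_HANDLE, yield
	 * higher_latency.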
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration, which is
 *	the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if ((mpo_mblock[i].mnode_mask & ((mnodeset_t)1 << mnode)) &&
		    (pfn <= mpo_mblock[i].end_pfn))
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
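	 *
	 * For example, with max_locality_groups = 4, the stripe for
	 * mnode 2 within mpo_mblock[3] is mem_stripes[3 * 4 + 2], per
	 * the indexing scheme described above plat_build_mem_nodes().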
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to
	 * physbase/physmax of mem_node.  If base or end fall in a gap,
	 * round them to start of nearest stripe.  If they fall within a
	 * stripe, keep base or end, but calculate the fragment size that
	 * should be excluded from the stripe.  Calculate how many strides
	 * fall in the adjusted range, multiply by stripe width, and add the
	 * start and end fragments.
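	 *
	 * As a hypothetical worked example (invented numbers): with
	 * mnode_pages = 4 and mnode_stride = 16 (so stripe_shift = 2),
	 * a test range covering exactly two whole strides of a stripe
	 * gives len = 32 with offset = frag = 0, contributing
	 * (32 >> 2) = 8 pages: 2 strides times the 4-page stripe width.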
	 */

	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.
	 * Apply minimal fixups in case of broken MDs to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}