1ce8eb11aSdp78419 /* 2ce8eb11aSdp78419 * CDDL HEADER START 3ce8eb11aSdp78419 * 4ce8eb11aSdp78419 * The contents of this file are subject to the terms of the 5ce8eb11aSdp78419 * Common Development and Distribution License (the "License"). 6ce8eb11aSdp78419 * You may not use this file except in compliance with the License. 7ce8eb11aSdp78419 * 8ce8eb11aSdp78419 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9ce8eb11aSdp78419 * or http://www.opensolaris.org/os/licensing. 10ce8eb11aSdp78419 * See the License for the specific language governing permissions 11ce8eb11aSdp78419 * and limitations under the License. 12ce8eb11aSdp78419 * 13ce8eb11aSdp78419 * When distributing Covered Code, include this CDDL HEADER in each 14ce8eb11aSdp78419 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15ce8eb11aSdp78419 * If applicable, add the following below this CDDL HEADER, with the 16ce8eb11aSdp78419 * fields enclosed by brackets "[]" replaced with your own identifying 17ce8eb11aSdp78419 * information: Portions Copyright [yyyy] [name of copyright owner] 18ce8eb11aSdp78419 * 19ce8eb11aSdp78419 * CDDL HEADER END 20ce8eb11aSdp78419 */ 21ce8eb11aSdp78419 22ce8eb11aSdp78419 /* 23*9853d9e8SJason Beloro * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24ce8eb11aSdp78419 * Use is subject to license terms. 
25ce8eb11aSdp78419 */ 26ce8eb11aSdp78419 27ce8eb11aSdp78419 #include <sys/types.h> 28ce8eb11aSdp78419 #include <sys/sysmacros.h> 29ce8eb11aSdp78419 #include <sys/machsystm.h> 30ce8eb11aSdp78419 #include <sys/machparam.h> 31ce8eb11aSdp78419 #include <sys/cmn_err.h> 32ce8eb11aSdp78419 #include <sys/stat.h> 33ce8eb11aSdp78419 #include <sys/mach_descrip.h> 34ce8eb11aSdp78419 #include <sys/memnode.h> 35ce8eb11aSdp78419 #include <sys/mdesc.h> 36ce8eb11aSdp78419 #include <sys/mpo.h> 37*9853d9e8SJason Beloro #include <vm/page.h> 38ce8eb11aSdp78419 #include <vm/vm_dep.h> 39e853d8c3Sjc25722 #include <vm/hat_sfmmu.h> 40bb57d1f5Sjc25722 #include <sys/promif.h> 41ce8eb11aSdp78419 42ce8eb11aSdp78419 /* 43ce8eb11aSdp78419 * MPO and the sun4v memory representation 44ce8eb11aSdp78419 * --------------------------------------- 45ce8eb11aSdp78419 * 46ce8eb11aSdp78419 * Latency groups are defined in the sun4v achitecture by memory-latency-group 47ce8eb11aSdp78419 * nodes in the Machine Description, as specified in FWARC/2007/260. These 48ce8eb11aSdp78419 * tie together cpu nodes and mblock nodes, and contain mask and match 49ce8eb11aSdp78419 * properties that identify the portion of an mblock that belongs to the 50ce8eb11aSdp78419 * lgroup. Mask and match are defined in the Physical Address (PA) space, 51ce8eb11aSdp78419 * but an mblock defines Real Addresses (RA). To translate, the mblock 52ce8eb11aSdp78419 * includes the property address-congruence-offset, hereafter referred to as 53ce8eb11aSdp78419 * ra_to_pa. A real address ra is a member of an lgroup if 54ce8eb11aSdp78419 * 55ce8eb11aSdp78419 * (ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match 56ce8eb11aSdp78419 * 57ce8eb11aSdp78419 * The MD is traversed, and information on all mblocks is kept in the array 58ce8eb11aSdp78419 * mpo_mblock[]. Information on all CPUs, including which lgroup they map 59ce8eb11aSdp78419 * to, is kept in the array mpo_cpu[]. 
60ce8eb11aSdp78419 * 61ce8eb11aSdp78419 * This implementation makes (and verifies) the simplifying assumption that 62ce8eb11aSdp78419 * the mask bits are the same for all defined lgroups, and that all 1 bits in 63ce8eb11aSdp78419 * the mask are contiguous. Thus the number of lgroups is bounded by the 64ce8eb11aSdp78419 * number of possible mask values, and the lgrp_handle_t is defined as the 65ce8eb11aSdp78419 * mask value, shifted right to eliminate the 0 bit positions in mask. The 66ce8eb11aSdp78419 * masks and values are also referred to as "home bits" in the code. 67ce8eb11aSdp78419 * 68ce8eb11aSdp78419 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup 69ce8eb11aSdp78419 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock 70ce8eb11aSdp78419 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the 71ce8eb11aSdp78419 * home bits. This yields the mem_node. 72ce8eb11aSdp78419 * 73ce8eb11aSdp78419 * Interfaces 74ce8eb11aSdp78419 * ---------- 75ce8eb11aSdp78419 * 76ce8eb11aSdp78419 * This file exports the following entry points: 77ce8eb11aSdp78419 * 78ce8eb11aSdp78419 * plat_lgrp_init() 79ce8eb11aSdp78419 * plat_build_mem_nodes() 80ce8eb11aSdp78419 * plat_lgrp_cpu_to_hand() 81ce8eb11aSdp78419 * plat_lgrp_latency() 82ce8eb11aSdp78419 * plat_pfn_to_mem_node() 83ce8eb11aSdp78419 * These implement the usual platform lgroup interfaces. 84ce8eb11aSdp78419 * 85ce8eb11aSdp78419 * plat_rapfn_to_papfn() 86ce8eb11aSdp78419 * Recover the PA page coloring bits from an RA. 87ce8eb11aSdp78419 * 88ce8eb11aSdp78419 * plat_mem_node_iterator_init() 89ce8eb11aSdp78419 * Initialize an iterator to efficiently step through pages in a mem_node. 90ce8eb11aSdp78419 * 91ce8eb11aSdp78419 * plat_mem_node_intersect_range() 92ce8eb11aSdp78419 * Find the intersection with a mem_node. 
 *
 * plat_slice_add()
 * plat_slice_del()
 *      Platform hooks to add/delete a pfn range.
 *
 * Internal Organization
 * ---------------------
 *
 * A number of routines are shared by the boot and DR code paths, and
 * are used to (re)build the appropriate MPO structures:
 *
 * mblock_alloc()
 *      Allocate memory for mblocks and stripes as
 *      appropriate for boot or memory DR.
 *
 * mblock_free()
 *      Free memory allocated by mblock_alloc.
 *
 * mblock_update()
 *      Build mblocks based on mblock nodes read from the MD.
 *
 * mblock_update_add()
 *      Rebuild mblocks after a memory DR add operation.
 *
 * mblock_update_del()
 *      Rebuild mblocks after a memory DR delete operation.
 *
 * mblock_install()
 *      Install mblocks as the new configuration.
 *
 * mstripe_update()
 *      Build stripes based on mblocks.
 *
 * mnode_update()
 *      Call memnode layer to add/del a pfn range, based on stripes.
128*9853d9e8SJason Beloro * 129*9853d9e8SJason Beloro * The platform interfaces allocate all memory required for the 130*9853d9e8SJason Beloro * particualar update first, block access to the MPO structures 131*9853d9e8SJason Beloro * while they are updated, and free old structures after the update. 132ce8eb11aSdp78419 */ 133ce8eb11aSdp78419 134ce8eb11aSdp78419 int sun4v_mpo_enable = 1; 135ce8eb11aSdp78419 int sun4v_mpo_debug = 0; 136ce8eb11aSdp78419 char sun4v_mpo_status[256] = ""; 137ce8eb11aSdp78419 138ce8eb11aSdp78419 /* Save CPU info from the MD and associate CPUs with lgroups */ 139ce8eb11aSdp78419 static struct cpu_md mpo_cpu[NCPU]; 140ce8eb11aSdp78419 141ce8eb11aSdp78419 /* Save lgroup info from the MD */ 142ce8eb11aSdp78419 #define MAX_MD_LGROUPS 32 143ce8eb11aSdp78419 static struct lgrp_md mpo_lgroup[MAX_MD_LGROUPS]; 144ce8eb11aSdp78419 static int n_lgrpnodes = 0; 145ce8eb11aSdp78419 static int n_locality_groups = 0; 146ce8eb11aSdp78419 static int max_locality_groups = 0; 147*9853d9e8SJason Beloro static int szc_mask0 = 0; 148ce8eb11aSdp78419 149ce8eb11aSdp78419 /* Save mblocks from the MD */ 150bb57d1f5Sjc25722 #define SMALL_MBLOCKS_COUNT 8 151bb57d1f5Sjc25722 static struct mblock_md *mpo_mblock; 152bb57d1f5Sjc25722 static struct mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT]; 153ce8eb11aSdp78419 static int n_mblocks = 0; 154ce8eb11aSdp78419 155ce8eb11aSdp78419 /* Save mem_node stripes calculate from mblocks and lgroups. 
*/ 156bb57d1f5Sjc25722 static mem_stripe_t *mem_stripes; 157bb57d1f5Sjc25722 static mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES]; 158ce8eb11aSdp78419 static int n_mem_stripes = 0; 159ce8eb11aSdp78419 static pfn_t mnode_stride; /* distance between stripes, start to start */ 160ce8eb11aSdp78419 static int stripe_shift; /* stride/stripes expressed as a shift */ 161ce8eb11aSdp78419 static pfn_t mnode_pages; /* mem_node stripe width */ 162ce8eb11aSdp78419 163ce8eb11aSdp78419 /* Save home mask and shift used to calculate lgrp_handle_t values */ 164ce8eb11aSdp78419 static uint64_t home_mask = 0; 165ce8eb11aSdp78419 static pfn_t home_mask_pfn = 0; 166ce8eb11aSdp78419 static int home_mask_shift = 0; 167ce8eb11aSdp78419 static uint_t home_mask_pfn_shift = 0; 168ce8eb11aSdp78419 169ce8eb11aSdp78419 /* Save lowest and highest latencies found across all lgroups */ 170ce8eb11aSdp78419 static int lower_latency = 0; 171ce8eb11aSdp78419 static int higher_latency = 0; 172ce8eb11aSdp78419 173ce8eb11aSdp78419 static pfn_t base_ra_to_pa_pfn = 0; /* ra_to_pa for single mblock memory */ 174*9853d9e8SJason Beloro static int mpo_genid; /* config gen; updated by mem DR */ 175*9853d9e8SJason Beloro static mpo_config_t mpo_config; /* current mblocks and stripes */ 176*9853d9e8SJason Beloro 177*9853d9e8SJason Beloro typedef enum { U_ADD, U_ADD_ALL, U_DEL } update_t; 178ce8eb11aSdp78419 179ce8eb11aSdp78419 static int valid_pages(md_t *md, mde_cookie_t cpu0); 180ce8eb11aSdp78419 static int unique_home_mem_lg_count(uint64_t mem_lg_homeset); 181ce8eb11aSdp78419 static int fix_interleave(void); 182ce8eb11aSdp78419 183*9853d9e8SJason Beloro static int mblock_alloc(mpo_config_t *, update_t, int nmblocks); 184*9853d9e8SJason Beloro static void mblock_install(mpo_config_t *); 185*9853d9e8SJason Beloro static void mblock_free(mpo_config_t *); 186*9853d9e8SJason Beloro static void mblock_update(mpo_config_t *, md_t, mde_cookie_t *mblocknodes); 187*9853d9e8SJason Beloro static void 
mblock_update_add(mpo_config_t *); 188*9853d9e8SJason Beloro static void mblock_update_del(mpo_config_t *, mpo_config_t *, pfn_t, pfn_t); 189*9853d9e8SJason Beloro static void mstripe_update(mpo_config_t *); 190*9853d9e8SJason Beloro static void mnode_update(mpo_config_t *, pfn_t, pfn_t, update_t); 191*9853d9e8SJason Beloro 192ce8eb11aSdp78419 /* Debug support */ 193ce8eb11aSdp78419 #if defined(DEBUG) && !defined(lint) 194*9853d9e8SJason Beloro #define VALIDATE_SLICE(base, end) { \ 195*9853d9e8SJason Beloro ASSERT(IS_P2ALIGNED(ptob(base), TTEBYTES(TTE256M))); \ 196*9853d9e8SJason Beloro ASSERT(IS_P2ALIGNED(ptob(end - base + 1), TTEBYTES(TTE256M))); \ 197*9853d9e8SJason Beloro } 198ce8eb11aSdp78419 #define MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args) 199ce8eb11aSdp78419 #else 200*9853d9e8SJason Beloro #define VALIDATE_SLICE(base, end) 201ce8eb11aSdp78419 #define MPO_DEBUG(...) 202ce8eb11aSdp78419 #endif /* DEBUG */ 203ce8eb11aSdp78419 204ce8eb11aSdp78419 /* Record status message, viewable from mdb */ 205ce8eb11aSdp78419 #define MPO_STATUS(args...) { \ 206ce8eb11aSdp78419 (void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args); \ 207ce8eb11aSdp78419 MPO_DEBUG(sun4v_mpo_status); \ 208ce8eb11aSdp78419 } 209ce8eb11aSdp78419 210ce8eb11aSdp78419 /* 211*9853d9e8SJason Beloro * The MPO locks are to protect the MPO metadata while that 212*9853d9e8SJason Beloro * information is updated as a result of a memory DR operation. 213*9853d9e8SJason Beloro * The read lock must be acquired to read the metadata and the 214*9853d9e8SJason Beloro * write locks must be acquired to update it. 
 */
#define	mpo_rd_lock	kpreempt_disable
#define	mpo_rd_unlock	kpreempt_enable

/*
 * Acquire the MPO write lock: pause all other CPUs so the MPO metadata
 * can be swapped without readers (who only kpreempt_disable) observing
 * a partial update.  cpu_lock is held around pause_cpus() as its
 * contract requires, and dropped before returning.
 */
static void
mpo_wr_lock()
{
	mutex_enter(&cpu_lock);
	pause_cpus(NULL);
	mutex_exit(&cpu_lock);
}

/* Release the MPO write lock by resuming the paused CPUs. */
static void
mpo_wr_unlock()
{
	mutex_enter(&cpu_lock);
	start_cpus();
	mutex_exit(&cpu_lock);
}

/*
 * Routine to read a uint64_t property from a given md node.
 * Returns the md_get_prop_val() result (0 on success, negative if the
 * property is missing); *val is filled in on success.
 */
static int64_t
get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

/* qsort comparator: order mblocks by ascending base address. */
static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

/* Sort an array of n mblocks in place by base address. */
static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 * Recompute the lgroup scheduler tunables from the current lgroup
 * population.  Called after boot-time traversal and after CPU DR
 * changes the per-lgroup CPU counts.
 */
static void
mpo_update_tunables(void)
{
	int i, ncpu_min;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup. Spread threads to a new lgroup if the
	 * current group is more than 50% loaded. Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
}

/*
 * Look up the MD cpu node whose PROP_LG_CPU_ID matches cpuid.
 * Returns the node cookie, or MDE_INVAL_ELEM_COOKIE if the MD is
 * unavailable or no matching cpu node exists.
 */
static mde_cookie_t
cpuid_to_cpunode(md_t *md, int cpuid)
{
	mde_cookie_t rootnode, foundnode, *cpunodes;
	uint64_t cpuid_prop;
	int n_cpunodes, i;

	if (md == NULL)
		return (MDE_INVAL_ELEM_COOKIE);

	rootnode = md_root_node(md);
	if (rootnode == MDE_INVAL_ELEM_COOKIE)
		return (MDE_INVAL_ELEM_COOKIE);

	n_cpunodes = md_alloc_scan_dag(md, rootnode, PROP_LG_CPU,
	    "fwd", &cpunodes);
	if (n_cpunodes <= 0 || n_cpunodes > NCPU)
		goto cpuid_fail;

	for (i = 0; i < n_cpunodes; i++) {
		if (md_get_prop_val(md, cpunodes[i], PROP_LG_CPU_ID,
		    &cpuid_prop))
			break;
		if (cpuid_prop == (uint64_t)cpuid) {
			foundnode = cpunodes[i];
			md_free_scan_dag(md, &cpunodes);
			return (foundnode);
		}
	}
cpuid_fail:
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	return (MDE_INVAL_ELEM_COOKIE);
}

/*
 * Map a cpu node to an index into mpo_lgroup[] by choosing the
 * reachable mem_lg with the smallest latency, then matching its
 * latency and address_match against the saved lgroup table.
 * Returns the mpo_lgroup index, or -1 on any MD read failure or if
 * no table entry matches.
 */
static int
mpo_cpu_to_lgroup(md_t *md, mde_cookie_t cpunode)
{
	mde_cookie_t *nodes;
	uint64_t latency, lowest_latency;
	uint64_t address_match, lowest_address_match;
	int n_lgroups, j, result = 0;

	/* Find lgroup nodes reachable from this cpu */
	n_lgroups = md_alloc_scan_dag(md, cpunode, PROP_LG_MEM_LG,
	    "fwd", &nodes);

	lowest_latency = ~(0UL);

	/* Find the lgroup node with the smallest latency */
	for (j = 0; j < n_lgroups; j++) {
		result = get_int(md, nodes[j], PROP_LG_LATENCY,
		    &latency);
		result |= get_int(md, nodes[j], PROP_LG_MATCH,
		    &address_match);
		if (result != 0) {
			j = -1;
			goto to_lgrp_done;
		}
		if (latency < lowest_latency) {
			lowest_latency = latency;
			lowest_address_match = address_match;
		}
	}
	for (j = 0; j < n_lgrpnodes; j++) {
		if ((mpo_lgroup[j].latency == lowest_latency) &&
		    (mpo_lgroup[j].addr_match == lowest_address_match))
			break;
	}
	if (j == n_lgrpnodes)
		j = -1;

to_lgrp_done:
	if (n_lgroups > 0)
		md_free_scan_dag(md, &nodes);
	return (j);
}

/*
Called when DR'ing in a CPU */ 384924db11bSjc25722 void 385924db11bSjc25722 mpo_cpu_add(int cpuid) 386924db11bSjc25722 { 387924db11bSjc25722 md_t *md; 388924db11bSjc25722 mde_cookie_t cpunode; 389924db11bSjc25722 390924db11bSjc25722 int i; 391924db11bSjc25722 392924db11bSjc25722 if (n_lgrpnodes <= 0) 393924db11bSjc25722 return; 394924db11bSjc25722 395924db11bSjc25722 md = md_get_handle(); 396924db11bSjc25722 397924db11bSjc25722 if (md == NULL) 398924db11bSjc25722 goto add_fail; 399924db11bSjc25722 400924db11bSjc25722 cpunode = cpuid_to_cpunode(md, cpuid); 401924db11bSjc25722 if (cpunode == MDE_INVAL_ELEM_COOKIE) 402924db11bSjc25722 goto add_fail; 403924db11bSjc25722 404924db11bSjc25722 i = mpo_cpu_to_lgroup(md, cpunode); 405924db11bSjc25722 if (i == -1) 406924db11bSjc25722 goto add_fail; 407924db11bSjc25722 408924db11bSjc25722 mpo_cpu[cpuid].lgrp_index = i; 409924db11bSjc25722 mpo_cpu[cpuid].home = mpo_lgroup[i].addr_match >> home_mask_shift; 410924db11bSjc25722 mpo_lgroup[i].ncpu++; 411924db11bSjc25722 mpo_update_tunables(); 412924db11bSjc25722 (void) md_fini_handle(md); 413924db11bSjc25722 return; 414924db11bSjc25722 add_fail: 415924db11bSjc25722 panic("mpo_cpu_add: Cannot read MD"); 416924db11bSjc25722 } 417924db11bSjc25722 418924db11bSjc25722 /* Called when DR'ing out a CPU */ 419924db11bSjc25722 void 420924db11bSjc25722 mpo_cpu_remove(int cpuid) 421924db11bSjc25722 { 422924db11bSjc25722 int i; 423924db11bSjc25722 424924db11bSjc25722 if (n_lgrpnodes <= 0) 425924db11bSjc25722 return; 426924db11bSjc25722 427924db11bSjc25722 i = mpo_cpu[cpuid].lgrp_index; 428924db11bSjc25722 mpo_lgroup[i].ncpu--; 429924db11bSjc25722 mpo_cpu[cpuid].home = 0; 430924db11bSjc25722 mpo_cpu[cpuid].lgrp_index = -1; 431924db11bSjc25722 mpo_update_tunables(); 432924db11bSjc25722 } 433924db11bSjc25722 434*9853d9e8SJason Beloro static mde_cookie_t 435*9853d9e8SJason Beloro md_get_root(md_t *md) 436ce8eb11aSdp78419 { 437*9853d9e8SJason Beloro mde_cookie_t root = MDE_INVAL_ELEM_COOKIE; 
438*9853d9e8SJason Beloro int n_nodes; 439ce8eb11aSdp78419 440ce8eb11aSdp78419 n_nodes = md_node_count(md); 441ce8eb11aSdp78419 442ce8eb11aSdp78419 if (n_nodes <= 0) { 443*9853d9e8SJason Beloro MPO_STATUS("md_get_root: No nodes in node count\n"); 444*9853d9e8SJason Beloro return (root); 445ce8eb11aSdp78419 } 446ce8eb11aSdp78419 447ce8eb11aSdp78419 root = md_root_node(md); 448ce8eb11aSdp78419 449ce8eb11aSdp78419 if (root == MDE_INVAL_ELEM_COOKIE) { 450*9853d9e8SJason Beloro MPO_STATUS("md_get_root: Root node is missing\n"); 451*9853d9e8SJason Beloro return (root); 452ce8eb11aSdp78419 } 453ce8eb11aSdp78419 454*9853d9e8SJason Beloro MPO_DEBUG("md_get_root: Node Count: %d\n", n_nodes); 455*9853d9e8SJason Beloro MPO_DEBUG("md_get_root: md: %p\n", md); 456*9853d9e8SJason Beloro MPO_DEBUG("md_get_root: root: %lx\n", root); 457*9853d9e8SJason Beloro done: 458*9853d9e8SJason Beloro return (root); 459ce8eb11aSdp78419 } 460ce8eb11aSdp78419 461*9853d9e8SJason Beloro static int 462*9853d9e8SJason Beloro lgrp_update(md_t *md, mde_cookie_t root) 463*9853d9e8SJason Beloro { 464*9853d9e8SJason Beloro int i, j, result; 465*9853d9e8SJason Beloro int ret_val = 0; 466*9853d9e8SJason Beloro int sub_page_fix; 467*9853d9e8SJason Beloro mde_cookie_t *nodes, *lgrpnodes; 468ce8eb11aSdp78419 469ce8eb11aSdp78419 n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG, 470ce8eb11aSdp78419 "fwd", &lgrpnodes); 471ce8eb11aSdp78419 472ce8eb11aSdp78419 if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) { 473*9853d9e8SJason Beloro MPO_STATUS("lgrp_update: No Lgroups\n"); 474ce8eb11aSdp78419 ret_val = -1; 475ce8eb11aSdp78419 goto fail; 476ce8eb11aSdp78419 } 477ce8eb11aSdp78419 478*9853d9e8SJason Beloro MPO_DEBUG("lgrp_update: mem_lgs: %d\n", n_lgrpnodes); 479ce8eb11aSdp78419 480ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 481ce8eb11aSdp78419 mpo_lgroup[i].node = lgrpnodes[i]; 482ce8eb11aSdp78419 mpo_lgroup[i].id = i; 483ce8eb11aSdp78419 mpo_lgroup[i].ncpu = 0; 484ce8eb11aSdp78419 result = 
get_int(md, lgrpnodes[i], PROP_LG_MASK, 485ce8eb11aSdp78419 &mpo_lgroup[i].addr_mask); 486ce8eb11aSdp78419 result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH, 487ce8eb11aSdp78419 &mpo_lgroup[i].addr_match); 488ce8eb11aSdp78419 489ce8eb11aSdp78419 /* 490ce8eb11aSdp78419 * If either the mask or match properties are missing, set to 0 491ce8eb11aSdp78419 */ 492ce8eb11aSdp78419 if (result < 0) { 493ce8eb11aSdp78419 mpo_lgroup[i].addr_mask = 0; 494ce8eb11aSdp78419 mpo_lgroup[i].addr_match = 0; 495ce8eb11aSdp78419 } 496ce8eb11aSdp78419 497ce8eb11aSdp78419 /* Set latency to 0 if property not present */ 498ce8eb11aSdp78419 499ce8eb11aSdp78419 result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY, 500ce8eb11aSdp78419 &mpo_lgroup[i].latency); 501ce8eb11aSdp78419 if (result < 0) 502ce8eb11aSdp78419 mpo_lgroup[i].latency = 0; 503ce8eb11aSdp78419 } 504ce8eb11aSdp78419 505ce8eb11aSdp78419 /* 506ce8eb11aSdp78419 * Sub-page level interleave is not yet supported. Check for it, 507ce8eb11aSdp78419 * and remove sub-page interleaved lgroups from mpo_lgroup and 508ce8eb11aSdp78419 * n_lgrpnodes. If no lgroups are left, return. 509ce8eb11aSdp78419 */ 510ce8eb11aSdp78419 511ce8eb11aSdp78419 sub_page_fix = fix_interleave(); 512ce8eb11aSdp78419 if (n_lgrpnodes == 0) { 513ce8eb11aSdp78419 ret_val = -1; 514ce8eb11aSdp78419 goto fail; 515ce8eb11aSdp78419 } 516ce8eb11aSdp78419 517ce8eb11aSdp78419 /* Ensure that all of the addr_mask values are the same */ 518ce8eb11aSdp78419 519ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 520ce8eb11aSdp78419 if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) { 521*9853d9e8SJason Beloro MPO_STATUS("lgrp_update: " 522ce8eb11aSdp78419 "addr_mask values are not the same\n"); 523ce8eb11aSdp78419 ret_val = -1; 524ce8eb11aSdp78419 goto fail; 525ce8eb11aSdp78419 } 526ce8eb11aSdp78419 } 527ce8eb11aSdp78419 528ce8eb11aSdp78419 /* 529ce8eb11aSdp78419 * Ensure that all lgrp nodes see all the mblocks. 
However, if 530ce8eb11aSdp78419 * sub-page interleave is being fixed, they do not, so skip 531ce8eb11aSdp78419 * the check. 532ce8eb11aSdp78419 */ 533ce8eb11aSdp78419 534ce8eb11aSdp78419 if (sub_page_fix == 0) { 535ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 536ce8eb11aSdp78419 j = md_alloc_scan_dag(md, mpo_lgroup[i].node, 537ce8eb11aSdp78419 PROP_LG_MBLOCK, "fwd", &nodes); 538ce8eb11aSdp78419 md_free_scan_dag(md, &nodes); 539ce8eb11aSdp78419 if (j != n_mblocks) { 540*9853d9e8SJason Beloro MPO_STATUS("lgrp_update: " 541ce8eb11aSdp78419 "sub-page interleave is being fixed\n"); 542ce8eb11aSdp78419 ret_val = -1; 543ce8eb11aSdp78419 goto fail; 544ce8eb11aSdp78419 } 545ce8eb11aSdp78419 } 546ce8eb11aSdp78419 } 547*9853d9e8SJason Beloro fail: 548*9853d9e8SJason Beloro if (n_lgrpnodes > 0) { 549*9853d9e8SJason Beloro md_free_scan_dag(md, &lgrpnodes); 550*9853d9e8SJason Beloro for (i = 0; i < n_lgrpnodes; i++) 551*9853d9e8SJason Beloro mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE; 552*9853d9e8SJason Beloro } 553*9853d9e8SJason Beloro 554*9853d9e8SJason Beloro return (ret_val); 555*9853d9e8SJason Beloro } 556*9853d9e8SJason Beloro 557*9853d9e8SJason Beloro /* 558*9853d9e8SJason Beloro * 559*9853d9e8SJason Beloro * Traverse the MD to determine: 560*9853d9e8SJason Beloro * 561*9853d9e8SJason Beloro * Number of CPU nodes, lgrp_nodes, and mblocks 562*9853d9e8SJason Beloro * Then for each lgrp_node, obtain the appropriate data. 563*9853d9e8SJason Beloro * For each CPU, determine its home locality and store it. 564*9853d9e8SJason Beloro * For each mblock, retrieve its data and store it. 
 */
static int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *mblocknodes;
	int o;
	uint64_t i, k, stripe, stride;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	mpo_config_t new_config;

	if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) {
		ret_val = -1;
		goto fail;
	}

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd",
	    &mblocknodes);
	if (n_mblocks <= 0) {
		MPO_STATUS("lgrp_traverse: No mblock nodes detected in Machine "
		    "Descriptor\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes. Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */
	if (mblock_alloc(&new_config, U_ADD_ALL, n_mblocks) < 0) {
		ret_val = -1;
		goto fail;
	}

	mblock_update(&new_config, md, mblocknodes);
	mblock_install(&new_config);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);

	/* Read and validate the lgroup (mem_lg) nodes. */
	if ((ret_val = lgrp_update(md, root)) == -1)
		goto fail;

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.  lgrp_update() has verified
	 * that all lgroups share the same addr_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Derive the mem_node stripe geometry from the home mask. */
	stripe_shift = highbit(max_locality_groups) - 1;
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;
	mnode_stride = btop(stride);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10. In the MD, the units are defined to be
	 * pico-seconds.
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	/* Convert pico-seconds to nano-sec/10 (divide by 10^4). */
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].lgrp_index = -1;
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */
		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		o = mpo_cpu_to_lgroup(md, cpunodes[i]);
		if (o == -1) {
			ret_val = -1;
			goto fail;
		}
		mpo_cpu[k].lgrp_index = o;
		mpo_cpu[k].home = mpo_lgroup[o].addr_match >> home_mask_shift;
		mpo_lgroup[o].ncpu++;
	}
	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* NOTE: the success path also falls through to this cleanup. */
	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0) {
		MPO_STATUS("MPO feature is enabled.\n");
	} else
		sun4v_mpo_enable = 0;	/* set this for DR */

	return (ret_val);
}

/*
 * Determine the number of unique mem_lg's present in our system
 */
static int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
769ce8eb11aSdp78419 */ 770ce8eb11aSdp78419 771ce8eb11aSdp78419 for (homeid = 0; homeid < NLGRPS_MAX; homeid++) { 772ce8eb11aSdp78419 if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) { 773ce8eb11aSdp78419 count++; 774ce8eb11aSdp78419 } 775ce8eb11aSdp78419 } 776ce8eb11aSdp78419 777ce8eb11aSdp78419 MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n", 778ce8eb11aSdp78419 mem_lg_homeset); 779ce8eb11aSdp78419 MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count); 780ce8eb11aSdp78419 781ce8eb11aSdp78419 /* Default must be at least one */ 782ce8eb11aSdp78419 if (count == 0) 783ce8eb11aSdp78419 count = 1; 784ce8eb11aSdp78419 785ce8eb11aSdp78419 return (count); 786ce8eb11aSdp78419 } 787ce8eb11aSdp78419 788ce8eb11aSdp78419 /* 789ce8eb11aSdp78419 * Platform specific lgroup initialization 790ce8eb11aSdp78419 */ 791ce8eb11aSdp78419 void 792ce8eb11aSdp78419 plat_lgrp_init(void) 793ce8eb11aSdp78419 { 794ce8eb11aSdp78419 md_t *md; 795924db11bSjc25722 int rc; 796ce8eb11aSdp78419 797ce8eb11aSdp78419 /* Get the Machine Descriptor handle */ 798ce8eb11aSdp78419 799ce8eb11aSdp78419 md = md_get_handle(); 800ce8eb11aSdp78419 801ce8eb11aSdp78419 /* If not, we cannot continue */ 802ce8eb11aSdp78419 803ce8eb11aSdp78419 if (md == NULL) { 804ce8eb11aSdp78419 panic("cannot access machine descriptor\n"); 805ce8eb11aSdp78419 } else { 806ce8eb11aSdp78419 rc = lgrp_traverse(md); 807ce8eb11aSdp78419 (void) md_fini_handle(md); 808ce8eb11aSdp78419 } 809ce8eb11aSdp78419 810ce8eb11aSdp78419 /* 811ce8eb11aSdp78419 * If we can't process the MD for lgroups then at least let the 812ce8eb11aSdp78419 * system try to boot. Assume we have one lgroup so that 813ce8eb11aSdp78419 * when plat_build_mem_nodes is called, it will attempt to init 814ce8eb11aSdp78419 * an mnode based on the supplied memory segment. 
815ce8eb11aSdp78419 */ 816ce8eb11aSdp78419 817ce8eb11aSdp78419 if (rc == -1) { 818ce8eb11aSdp78419 home_mask_pfn = 0; 819ce8eb11aSdp78419 max_locality_groups = 1; 820ce8eb11aSdp78419 n_locality_groups = 1; 821ce8eb11aSdp78419 return; 822ce8eb11aSdp78419 } 823ce8eb11aSdp78419 824ce8eb11aSdp78419 mem_node_pfn_shift = 0; 825ce8eb11aSdp78419 mem_node_physalign = 0; 826ce8eb11aSdp78419 827ce8eb11aSdp78419 /* Use lgroup-aware TSB allocations */ 828ce8eb11aSdp78419 tsb_lgrp_affinity = 1; 829ce8eb11aSdp78419 830ce8eb11aSdp78419 /* Require that a home lgroup have some memory to be chosen */ 831ce8eb11aSdp78419 lgrp_mem_free_thresh = 1; 832ce8eb11aSdp78419 833ce8eb11aSdp78419 /* Standard home-on-next-touch policy */ 834ce8eb11aSdp78419 lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT; 835ce8eb11aSdp78419 836ce8eb11aSdp78419 /* Disable option to choose root lgroup if all leaf lgroups are busy */ 837ce8eb11aSdp78419 lgrp_load_thresh = UINT32_MAX; 838924db11bSjc25722 839924db11bSjc25722 mpo_update_tunables(); 840ce8eb11aSdp78419 } 841ce8eb11aSdp78419 842ce8eb11aSdp78419 /* 843ce8eb11aSdp78419 * Helper routine for debugging calls to mem_node_add_slice() 844ce8eb11aSdp78419 */ 845ce8eb11aSdp78419 static void 846ce8eb11aSdp78419 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn) 847ce8eb11aSdp78419 { 848ce8eb11aSdp78419 #if defined(DEBUG) && !defined(lint) 849ce8eb11aSdp78419 static int slice_count = 0; 850ce8eb11aSdp78419 851ce8eb11aSdp78419 slice_count++; 852ce8eb11aSdp78419 MPO_DEBUG("mem_add_slice(%d): basepfn: %lx endpfn: %lx\n", 853ce8eb11aSdp78419 slice_count, basepfn, endpfn); 854ce8eb11aSdp78419 #endif 855ce8eb11aSdp78419 mem_node_add_slice(basepfn, endpfn); 856ce8eb11aSdp78419 } 857ce8eb11aSdp78419 858*9853d9e8SJason Beloro static void 859*9853d9e8SJason Beloro mpo_mem_node_del_slice(pfn_t basepfn, pfn_t endpfn) 860*9853d9e8SJason Beloro { 861*9853d9e8SJason Beloro #if defined(DEBUG) && !defined(lint) 862*9853d9e8SJason Beloro static int slice_count = 0; 863*9853d9e8SJason 
Beloro 864*9853d9e8SJason Beloro slice_count++; 865*9853d9e8SJason Beloro MPO_DEBUG("mem_del_slice(%d): basepfn: %lx endpfn: %lx\n", 866*9853d9e8SJason Beloro slice_count, basepfn, endpfn); 867*9853d9e8SJason Beloro #endif 868*9853d9e8SJason Beloro mem_node_del_slice(basepfn, endpfn); 869*9853d9e8SJason Beloro } 870*9853d9e8SJason Beloro 871ce8eb11aSdp78419 /* 872ce8eb11aSdp78419 * Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node() 873ce8eb11aSdp78419 */ 874ce8eb11aSdp78419 static void 875ce8eb11aSdp78419 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode) 876ce8eb11aSdp78419 { 877ce8eb11aSdp78419 MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, " 878ce8eb11aSdp78419 "mnode index: %d\n", plathand, mnode); 879ce8eb11aSdp78419 plat_assign_lgrphand_to_mem_node(plathand, mnode); 880ce8eb11aSdp78419 } 881ce8eb11aSdp78419 882ce8eb11aSdp78419 /* 883ce8eb11aSdp78419 * plat_build_mem_nodes() 884ce8eb11aSdp78419 * 885ce8eb11aSdp78419 * Define the mem_nodes based on the modified boot memory list, 886ce8eb11aSdp78419 * or based on info read from the MD in plat_lgrp_init(). 887ce8eb11aSdp78419 * 888ce8eb11aSdp78419 * When the home mask lies in the middle of the address bits (as it does on 889ce8eb11aSdp78419 * Victoria Falls), then the memory in one mem_node is no longer contiguous; 890ce8eb11aSdp78419 * it is striped across an mblock in a repeating pattern of contiguous memory 891ce8eb11aSdp78419 * followed by a gap. The stripe width is the size of the contiguous piece. 892ce8eb11aSdp78419 * The stride is the distance from the start of one contiguous piece to the 893ce8eb11aSdp78419 * start of the next. The gap is thus stride - stripe_width. 894ce8eb11aSdp78419 * 895ce8eb11aSdp78419 * The stripe of an mnode that falls within an mblock is described by the type 896ce8eb11aSdp78419 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock. The 897ce8eb11aSdp78419 * mem_stripe_t's are kept in a global array mem_stripes[]. 
The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 * The stripe width is kept in the global mnode_pages.
 * The stride is kept in the global mnode_stride.
 * All the above use pfn's as the unit.
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ... 012301230123 ...
 *	    mblock 0	   mblock 1
 */

/*ARGSUSED*/
void
plat_build_mem_nodes(prom_memlist_t *list, size_t nelems)
{
	int elem;
	uint64_t base, len;

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;

	/* Recompute the mem_stripes[] array from the current mblock config */
	mstripe_update(&mpo_config);

	/* Check for non-MPO sun4v platforms */
	if (n_locality_groups <= 1) {
		/*
		 * Single-lgroup case: everything maps to mnode 0, and each
		 * boot memlist entry is added as one contiguous slice.
		 */
		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
		for (elem = 0; elem < nelems; list++, elem++) {
			base = list->addr;
			len = list->size;

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
	} else
		/* MPO case: build all mnodes from the striped mblock config */
		mnode_update(&mpo_config, 0, 0, U_ADD_ALL);

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	lgrp_handle_t lgrphand;

	/* Hold the MPO read lock while reading mpo_cpu[] */
	mpo_rd_lock();
	if (n_locality_groups > 1) {
		lgrphand = (lgrp_handle_t)mpo_cpu[(int)id].home;
	} else {
		lgrphand = (lgrp_handle_t)LGRP_DEFAULT_HANDLE; /* Default */
	}
	mpo_rd_unlock();

	return (lgrphand);
}

/*
 * Return the latency between two lgroups.  lower_latency and
 * higher_latency were derived from the MD by lgrp_traverse(), in units
 * of nsec/10.
 */
int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

/*
 * Map a pfn (in RA space) to its mem_node.  Panics if the pfn does not
 * fall within any known mblock.
 */
int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mpo_rd_lock();
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			mpo_rd_unlock();
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	/* Fast path: with one mblock, a single global offset applies */
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	mpo_rd_lock();
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mpo_rd_unlock();
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie "it" to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	szc: desired page size.
 *	init:
 *	    if 1, start a new traversal, initialize "it", find first
 *	    mblock containing pfn, and return its starting pfn
 *	    within the mnode.
 *	    if 0, continue the previous traversal using passed-in data
 *	    from "it", advance to the next mblock, and return its
 *	    starting pfn within the mnode.
 *	it: returns readonly data to the caller; see below.
 *
 *	The input pfn must be aligned for the page size szc.
 *
 *	Returns: starting pfn for the iteration for the mnode/mblock,
 *	    which is aligned according to the page size,
 *	    or returns (pfn_t)(-1) if the input pfn lies past the last
 *	    valid pfn of the mnode.
 *	Returns misc values in the "it" struct that allows the caller
 *	to advance the pfn within an mblock using address arithmetic;
 *	see definition of mem_node_iterator_t in vm_dep.h.
 *	When the caller calculates a pfn that is greater than the
 *	returned value it->mi_mblock_end, the caller should again
 *	call plat_mem_node_iterator_init, passing init=0.
 *
 *	The last mblock in continuation case may be invalid because
 *	of memory DR. To detect this situation mi_genid is checked
 *	against mpo_genid which is incremented after a memory DR
 *	operation. See also plat_slice_add()/plat_slice_del().
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode, uchar_t szc,
    mem_node_iterator_t *it, int init)
{
	int i;
	pgcnt_t szcpgcnt = PNUM_SIZE(szc);	/* pages per szc page */
	struct mblock_md *mblock;
	pfn_t base, end;
	mem_stripe_t *ms;
	uint64_t szcpagesize;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);
	ASSERT(P2PHASE(pfn, szcpgcnt) == 0);

	mpo_rd_lock();

	/*
	 * Restart the traversal if explicitly requested, or if a memory
	 * DR operation has invalidated the state saved in "it".
	 */
	if (init || (it->mi_genid != mpo_genid)) {
		it->mi_genid = mpo_genid;
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		/* The RA->PA offset must preserve szc alignment */
		if (P2PHASE(base_ra_to_pa_pfn, szcpgcnt)) {
			pfn = (pfn_t)-1;
			goto done;
		}
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = P2ROUNDUP(it->mi_mblock_base, szcpgcnt);
		if ((pfn + szcpgcnt - 1) > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		goto done;
	}

	/* init=1 means begin iterator, init=0 means continue */
	if (init == 1) {
		i = 0;
	} else {
		ASSERT(it->mi_last_mblock < n_mblocks);
		i = it->mi_last_mblock;
		ASSERT(pfn >
		    mem_stripes[i * max_locality_groups + mnode].physmax);
		if (++i == n_mblocks) {
			pfn = (pfn_t)-1;
			goto done;
		}
	}

	/*
	 * Find mblock that contains pfn for mnode's stripe, or first such an
	 * mblock after pfn, else pfn is out of bound and we'll return -1.
	 * mblocks and stripes are sorted in ascending address order.
	 */
	szcpagesize = szcpgcnt << PAGESHIFT;
	for (; i < n_mblocks; i++) {
		/* skip mblocks whose RA->PA offset breaks szc alignment */
		if (P2PHASE(mpo_mblock[i].ra_to_pa, szcpagesize))
			continue;
		ms = &mem_stripes[i * max_locality_groups + mnode];
		/* stripe must hold at least one aligned szc page above pfn */
		if (ms->exists && (pfn + szcpgcnt - 1) <= ms->physmax &&
		    (P2ROUNDUP(ms->physbase, szcpgcnt) + szcpgcnt - 1) <=
		    ms->physmax)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		pfn = (pfn_t)-1;
		goto done;
	}

	it->mi_last_mblock = i;

	mblock = &mpo_mblock[i];
	base = ms->physbase;
	end = ms->physmax;

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base) {
		pfn = P2ROUNDUP(base, szcpgcnt);
		ASSERT(pfn + szcpgcnt - 1 <= end);
	}
	ASSERT((pfn + szcpgcnt - 1) <= mpo_mblock[i].end_pfn);
done:
	mpo_rd_unlock();
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	/* No overlap with the mnode at all: nothing to count */
	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		/* Contiguous mnode: the intersection is a simple interval */
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	/* Gap between consecutive stripes of the same mnode, in pfns */
	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
	 */

	mpo_rd_lock();
	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			/*
			 * len covers whole strides; the shift divides out
			 * max_locality_groups to keep only this mnode's
			 * stripe pages, then the partial start/end
			 * fragments are excluded.
			 */
			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
	mpo_rd_unlock();
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

/* Extract the mnode number from a physical address via the home mask */
#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	/* With no cpu node, reuse the mask cached from a previous call */
	if (cpu0 == NULL)
		szc_mask = szc_mask0;
	else {
		if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
			szc_mask = 0;
		/* largest in sun4v default support */
		szc_mask |=  (1 << TTE4M);
		szc_mask0 = szc_mask;
	}
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	/* The mnode slice must be able to hold the largest coalesced page */
	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}


/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
1431ce8eb11aSdp78419 */ 1432ce8eb11aSdp78419 1433ce8eb11aSdp78419 static int 1434ce8eb11aSdp78419 fix_interleave(void) 1435ce8eb11aSdp78419 { 1436ce8eb11aSdp78419 int i, j; 1437ce8eb11aSdp78419 uint64_t mask = 0; 1438ce8eb11aSdp78419 1439ce8eb11aSdp78419 j = 0; 1440ce8eb11aSdp78419 for (i = 0; i < n_lgrpnodes; i++) { 1441ce8eb11aSdp78419 if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) { 1442ce8eb11aSdp78419 /* remove this lgroup */ 1443ce8eb11aSdp78419 mask = mpo_lgroup[i].addr_mask; 1444ce8eb11aSdp78419 } else { 1445ce8eb11aSdp78419 mpo_lgroup[j++] = mpo_lgroup[i]; 1446ce8eb11aSdp78419 } 1447ce8eb11aSdp78419 } 1448ce8eb11aSdp78419 n_lgrpnodes = j; 1449ce8eb11aSdp78419 1450ce8eb11aSdp78419 if (mask != 0) 1451ce8eb11aSdp78419 MPO_STATUS("sub-page interleave %lx found; " 1452ce8eb11aSdp78419 "removing lgroup.\n", mask); 1453ce8eb11aSdp78419 1454ce8eb11aSdp78419 return (mask != 0); 1455ce8eb11aSdp78419 } 1456*9853d9e8SJason Beloro 1457*9853d9e8SJason Beloro /* 1458*9853d9e8SJason Beloro * mblock_alloc 1459*9853d9e8SJason Beloro * 1460*9853d9e8SJason Beloro * Allocate memory for mblock an stripe arrays from either static or 1461*9853d9e8SJason Beloro * dynamic space depending on utype, and return the result in mc. 1462*9853d9e8SJason Beloro * Returns 0 on success and -1 on error. 
1463*9853d9e8SJason Beloro */ 1464*9853d9e8SJason Beloro 1465*9853d9e8SJason Beloro static int 1466*9853d9e8SJason Beloro mblock_alloc(mpo_config_t *mc, update_t utype, int nmblocks) 1467*9853d9e8SJason Beloro { 1468*9853d9e8SJason Beloro mblock_md_t *mb = NULL; 1469*9853d9e8SJason Beloro mem_stripe_t *ms = NULL; 1470*9853d9e8SJason Beloro int nstripes = MAX_MEM_NODES * nmblocks; 1471*9853d9e8SJason Beloro size_t mblocksz = nmblocks * sizeof (struct mblock_md); 1472*9853d9e8SJason Beloro size_t mstripesz = nstripes * sizeof (mem_stripe_t); 1473*9853d9e8SJason Beloro size_t allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz)); 1474*9853d9e8SJason Beloro 1475*9853d9e8SJason Beloro /* 1476*9853d9e8SJason Beloro * Allocate space for mblocks and mstripes. 1477*9853d9e8SJason Beloro * 1478*9853d9e8SJason Beloro * For DR allocations, just use kmem_alloc(), and set 1479*9853d9e8SJason Beloro * mc_alloc_sz to indicate it was used. 1480*9853d9e8SJason Beloro * 1481*9853d9e8SJason Beloro * For boot allocation: 1482*9853d9e8SJason Beloro * If we have a small number of mblocks we will use the space 1483*9853d9e8SJason Beloro * that we preallocated. Otherwise, we will dynamically 1484*9853d9e8SJason Beloro * allocate the space from the prom and map it to the 1485*9853d9e8SJason Beloro * reserved VA at MPOBUF_BASE. 
1486*9853d9e8SJason Beloro */ 1487*9853d9e8SJason Beloro 1488*9853d9e8SJason Beloro if (utype == U_ADD || utype == U_DEL) { 1489*9853d9e8SJason Beloro mb = (struct mblock_md *)kmem_zalloc(allocsz, KM_SLEEP); 1490*9853d9e8SJason Beloro ms = (mem_stripe_t *)(mb + nmblocks); 1491*9853d9e8SJason Beloro mc->mc_alloc_sz = allocsz; 1492*9853d9e8SJason Beloro } else if (nmblocks <= SMALL_MBLOCKS_COUNT) { 1493*9853d9e8SJason Beloro mb = &small_mpo_mblocks[0]; 1494*9853d9e8SJason Beloro ms = &small_mem_stripes[0]; 1495*9853d9e8SJason Beloro mc->mc_alloc_sz = 0; 1496*9853d9e8SJason Beloro } else { 1497*9853d9e8SJason Beloro /* Ensure that we dont request more space than reserved */ 1498*9853d9e8SJason Beloro if (allocsz > MPOBUF_SIZE) { 1499*9853d9e8SJason Beloro MPO_STATUS("mblock_alloc: Insufficient space " 1500*9853d9e8SJason Beloro "for mblock structures \n"); 1501*9853d9e8SJason Beloro return (-1); 1502*9853d9e8SJason Beloro } 1503*9853d9e8SJason Beloro mb = (struct mblock_md *) 1504*9853d9e8SJason Beloro prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE); 1505*9853d9e8SJason Beloro if (mb != (struct mblock_md *)MPOBUF_BASE) { 1506*9853d9e8SJason Beloro MPO_STATUS("mblock_alloc: Cannot allocate space " 1507*9853d9e8SJason Beloro "for mblocks \n"); 1508*9853d9e8SJason Beloro return (-1); 1509*9853d9e8SJason Beloro } 1510*9853d9e8SJason Beloro mpo_heap32_buf = (caddr_t)MPOBUF_BASE; 1511*9853d9e8SJason Beloro mpo_heap32_bufsz = MPOBUF_SIZE; 1512*9853d9e8SJason Beloro ms = (mem_stripe_t *)(mb + nmblocks); 1513*9853d9e8SJason Beloro mc->mc_alloc_sz = 0; 1514*9853d9e8SJason Beloro } 1515*9853d9e8SJason Beloro mc->mc_mblocks = mb; 1516*9853d9e8SJason Beloro mc->mc_stripes = ms; 1517*9853d9e8SJason Beloro mc->mc_nmblocks = nmblocks; 1518*9853d9e8SJason Beloro mc->mc_nstripes = nstripes; 1519*9853d9e8SJason Beloro MPO_DEBUG("mblock_alloc: mblocks: %d\n", nmblocks); 1520*9853d9e8SJason Beloro return (0); 1521*9853d9e8SJason Beloro } 1522*9853d9e8SJason Beloro 1523*9853d9e8SJason 
Beloro /* 1524*9853d9e8SJason Beloro * mblock_free 1525*9853d9e8SJason Beloro * 1526*9853d9e8SJason Beloro * Free memory in mc that was allocated by mblock_alloc. 1527*9853d9e8SJason Beloro */ 1528*9853d9e8SJason Beloro 1529*9853d9e8SJason Beloro static void 1530*9853d9e8SJason Beloro mblock_free(mpo_config_t *mc) 1531*9853d9e8SJason Beloro { 1532*9853d9e8SJason Beloro if (mc->mc_alloc_sz > 0) { 1533*9853d9e8SJason Beloro ASSERT(mc->mc_mblocks != mpo_mblock); 1534*9853d9e8SJason Beloro kmem_free((caddr_t)mc->mc_mblocks, mc->mc_alloc_sz); 1535*9853d9e8SJason Beloro } 1536*9853d9e8SJason Beloro bzero(mc, sizeof (*mc)); 1537*9853d9e8SJason Beloro } 1538*9853d9e8SJason Beloro 1539*9853d9e8SJason Beloro /* 1540*9853d9e8SJason Beloro * mblock_install 1541*9853d9e8SJason Beloro * 1542*9853d9e8SJason Beloro * Install mblock config passed in mc as the global configuration. 1543*9853d9e8SJason Beloro * May only be called at boot or while holding mpo_wr_lock. 1544*9853d9e8SJason Beloro */ 1545*9853d9e8SJason Beloro 1546*9853d9e8SJason Beloro static void 1547*9853d9e8SJason Beloro mblock_install(mpo_config_t *mc) 1548*9853d9e8SJason Beloro { 1549*9853d9e8SJason Beloro mpo_mblock = mc->mc_mblocks; 1550*9853d9e8SJason Beloro n_mblocks = mc->mc_nmblocks; 1551*9853d9e8SJason Beloro mem_stripes = mc->mc_stripes; 1552*9853d9e8SJason Beloro n_mem_stripes = mc->mc_nstripes; 1553*9853d9e8SJason Beloro base_ra_to_pa_pfn = btop(mc->mc_mblocks[0].ra_to_pa); 1554*9853d9e8SJason Beloro mpo_config = *mc; 1555*9853d9e8SJason Beloro } 1556*9853d9e8SJason Beloro 1557*9853d9e8SJason Beloro /* 1558*9853d9e8SJason Beloro * mblock_update 1559*9853d9e8SJason Beloro * 1560*9853d9e8SJason Beloro * Traverse mblocknodes, read the mblock properties from the MD, and 1561*9853d9e8SJason Beloro * save the mblocks in mc. 
1562*9853d9e8SJason Beloro */ 1563*9853d9e8SJason Beloro 1564*9853d9e8SJason Beloro static void 1565*9853d9e8SJason Beloro mblock_update(mpo_config_t *mc, md_t md, mde_cookie_t *mblocknodes) 1566*9853d9e8SJason Beloro { 1567*9853d9e8SJason Beloro uint64_t i, j; 1568*9853d9e8SJason Beloro int result = 0; 1569*9853d9e8SJason Beloro mblock_md_t *mblock = mc->mc_mblocks; 1570*9853d9e8SJason Beloro 1571*9853d9e8SJason Beloro for (i = 0, j = 0; j < mc->mc_nmblocks; j++) { 1572*9853d9e8SJason Beloro 1573*9853d9e8SJason Beloro /* Without a base or size value we will fail */ 1574*9853d9e8SJason Beloro result = get_int(md, mblocknodes[j], PROP_LG_BASE, 1575*9853d9e8SJason Beloro &mblock[i].base); 1576*9853d9e8SJason Beloro if (result < 0) { 1577*9853d9e8SJason Beloro MPO_STATUS("mblock_update: " 1578*9853d9e8SJason Beloro "PROP_LG_BASE is missing\n"); 1579*9853d9e8SJason Beloro mc->mc_nmblocks = 0; 1580*9853d9e8SJason Beloro return; 1581*9853d9e8SJason Beloro } 1582*9853d9e8SJason Beloro 1583*9853d9e8SJason Beloro result = get_int(md, mblocknodes[j], PROP_LG_SIZE, 1584*9853d9e8SJason Beloro &mblock[i].size); 1585*9853d9e8SJason Beloro if (result < 0) { 1586*9853d9e8SJason Beloro MPO_STATUS("mblock_update: " 1587*9853d9e8SJason Beloro "PROP_LG_SIZE is missing\n"); 1588*9853d9e8SJason Beloro mc->mc_nmblocks = 0; 1589*9853d9e8SJason Beloro return; 1590*9853d9e8SJason Beloro } 1591*9853d9e8SJason Beloro 1592*9853d9e8SJason Beloro result = get_int(md, mblocknodes[j], 1593*9853d9e8SJason Beloro PROP_LG_RA_PA_OFFSET, &mblock[i].ra_to_pa); 1594*9853d9e8SJason Beloro 1595*9853d9e8SJason Beloro /* If we don't have an ra_pa_offset, just set it to 0 */ 1596*9853d9e8SJason Beloro if (result < 0) 1597*9853d9e8SJason Beloro mblock[i].ra_to_pa = 0; 1598*9853d9e8SJason Beloro 1599*9853d9e8SJason Beloro MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, " 1600*9853d9e8SJason Beloro "ra_to_pa = %lx\n", i, 1601*9853d9e8SJason Beloro mblock[i].base, 1602*9853d9e8SJason Beloro mblock[i].size, 
1603*9853d9e8SJason Beloro mblock[i].ra_to_pa); 1604*9853d9e8SJason Beloro 1605*9853d9e8SJason Beloro /* check for unsupportable values of base and size */ 1606*9853d9e8SJason Beloro if (mblock[i].base > mblock[i].base + mblock[i].size) { 1607*9853d9e8SJason Beloro MPO_STATUS("mblock_update: " 1608*9853d9e8SJason Beloro "PROP_LG_BASE+PROP_LG_SIZE is invalid: " 1609*9853d9e8SJason Beloro "base = %lx, size = %lx\n", 1610*9853d9e8SJason Beloro mblock[i].base, mblock[i].size); 1611*9853d9e8SJason Beloro mc->mc_nmblocks = 0; 1612*9853d9e8SJason Beloro return; 1613*9853d9e8SJason Beloro } 1614*9853d9e8SJason Beloro 1615*9853d9e8SJason Beloro /* eliminate size==0 blocks */ 1616*9853d9e8SJason Beloro if (mblock[i].size != 0) { 1617*9853d9e8SJason Beloro uint64_t base = mblock[i].base; 1618*9853d9e8SJason Beloro uint64_t end = base + mblock[i].size; 1619*9853d9e8SJason Beloro ASSERT(end > base); 1620*9853d9e8SJason Beloro mblock[i].base_pfn = btop(base); 1621*9853d9e8SJason Beloro mblock[i].end_pfn = btop(end - 1); 1622*9853d9e8SJason Beloro i++; 1623*9853d9e8SJason Beloro } 1624*9853d9e8SJason Beloro } 1625*9853d9e8SJason Beloro 1626*9853d9e8SJason Beloro if (i == 0) { 1627*9853d9e8SJason Beloro MPO_STATUS("mblock_update: " 1628*9853d9e8SJason Beloro "No non-empty mblock nodes were found " 1629*9853d9e8SJason Beloro "in the Machine Descriptor\n"); 1630*9853d9e8SJason Beloro mc->mc_nmblocks = 0; 1631*9853d9e8SJason Beloro return; 1632*9853d9e8SJason Beloro } 1633*9853d9e8SJason Beloro ASSERT(i <= mc->mc_nmblocks); 1634*9853d9e8SJason Beloro mc->mc_nmblocks = i; 1635*9853d9e8SJason Beloro 1636*9853d9e8SJason Beloro /* Must sort mblocks by address for mem_node_iterator_init() */ 1637*9853d9e8SJason Beloro mblock_sort(mblock, mc->mc_nmblocks); 1638*9853d9e8SJason Beloro } 1639*9853d9e8SJason Beloro 1640*9853d9e8SJason Beloro /* 1641*9853d9e8SJason Beloro * mblock_update_add 1642*9853d9e8SJason Beloro * 1643*9853d9e8SJason Beloro * Update mblock config after a memory DR add. 
The added range is not 1644*9853d9e8SJason Beloro * needed, as we read *all* mblock nodes from the MD. Save the mblocks 1645*9853d9e8SJason Beloro * in mc. 1646*9853d9e8SJason Beloro */ 1647*9853d9e8SJason Beloro 1648*9853d9e8SJason Beloro static void 1649*9853d9e8SJason Beloro mblock_update_add(mpo_config_t *mc) 1650*9853d9e8SJason Beloro { 1651*9853d9e8SJason Beloro md_t *md; 1652*9853d9e8SJason Beloro mde_cookie_t root, *mblocknodes; 1653*9853d9e8SJason Beloro int nmblocks = 0; 1654*9853d9e8SJason Beloro 1655*9853d9e8SJason Beloro if ((md = md_get_handle()) == NULL) { 1656*9853d9e8SJason Beloro MPO_STATUS("Cannot access Machine Descriptor\n"); 1657*9853d9e8SJason Beloro goto error; 1658*9853d9e8SJason Beloro } 1659*9853d9e8SJason Beloro 1660*9853d9e8SJason Beloro if ((root = md_get_root(md)) == MDE_INVAL_ELEM_COOKIE) 1661*9853d9e8SJason Beloro goto error; 1662*9853d9e8SJason Beloro 1663*9853d9e8SJason Beloro nmblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK, "fwd", 1664*9853d9e8SJason Beloro &mblocknodes); 1665*9853d9e8SJason Beloro if (nmblocks <= 0) { 1666*9853d9e8SJason Beloro MPO_STATUS("No mblock nodes detected in Machine Descriptor\n"); 1667*9853d9e8SJason Beloro goto error; 1668*9853d9e8SJason Beloro } 1669*9853d9e8SJason Beloro 1670*9853d9e8SJason Beloro if (mblock_alloc(mc, U_ADD, nmblocks) < 0) 1671*9853d9e8SJason Beloro goto error; 1672*9853d9e8SJason Beloro 1673*9853d9e8SJason Beloro mblock_update(mc, md, mblocknodes); 1674*9853d9e8SJason Beloro md_free_scan_dag(md, &mblocknodes); 1675*9853d9e8SJason Beloro (void) md_fini_handle(md); 1676*9853d9e8SJason Beloro return; 1677*9853d9e8SJason Beloro error: 1678*9853d9e8SJason Beloro panic("mblock_update_add: cannot process mblocks from MD.\n"); 1679*9853d9e8SJason Beloro } 1680*9853d9e8SJason Beloro 1681*9853d9e8SJason Beloro /* 1682*9853d9e8SJason Beloro * mblock_update_del 1683*9853d9e8SJason Beloro * 1684*9853d9e8SJason Beloro * Update mblocks after a memory DR deletion of the range (ubase, uend). 
1685*9853d9e8SJason Beloro * Allocate a new mblock config, copy old config to the new, modify the new 1686*9853d9e8SJason Beloro * mblocks to reflect the deletion. The new mblocks are returned in 1687*9853d9e8SJason Beloro * mc_new and are not yet installed as the active config. 1688*9853d9e8SJason Beloro */ 1689*9853d9e8SJason Beloro 1690*9853d9e8SJason Beloro static void 1691*9853d9e8SJason Beloro mblock_update_del(mpo_config_t *mc_new, mpo_config_t *mc_old, pfn_t ubase, 1692*9853d9e8SJason Beloro pfn_t uend) 1693*9853d9e8SJason Beloro { 1694*9853d9e8SJason Beloro int i, j; 1695*9853d9e8SJason Beloro pfn_t base, end; 1696*9853d9e8SJason Beloro mblock_md_t *mblock; 1697*9853d9e8SJason Beloro int nmblocks = mc_old->mc_nmblocks; 1698*9853d9e8SJason Beloro 1699*9853d9e8SJason Beloro MPO_DEBUG("mblock_update_del(0x%lx, 0x%lx)\n", ubase, uend); 1700*9853d9e8SJason Beloro 1701*9853d9e8SJason Beloro /* 1702*9853d9e8SJason Beloro * Allocate mblocks in mc_new and copy the old to the new. 1703*9853d9e8SJason Beloro * Allocate one extra in case the deletion splits an mblock. 1704*9853d9e8SJason Beloro */ 1705*9853d9e8SJason Beloro if (mblock_alloc(mc_new, U_DEL, nmblocks + 1) < 0) 1706*9853d9e8SJason Beloro return; 1707*9853d9e8SJason Beloro mblock = mc_new->mc_mblocks; 1708*9853d9e8SJason Beloro bcopy(mc_old->mc_mblocks, mblock, nmblocks * sizeof (mblock_md_t)); 1709*9853d9e8SJason Beloro 1710*9853d9e8SJason Beloro /* 1711*9853d9e8SJason Beloro * Find the mblock containing the deleted range and adjust it in 1712*9853d9e8SJason Beloro * the new config. 1713*9853d9e8SJason Beloro */ 1714*9853d9e8SJason Beloro for (i = 0; i < nmblocks; i++) { 1715*9853d9e8SJason Beloro 1716*9853d9e8SJason Beloro base = btop(mblock[i].base); 1717*9853d9e8SJason Beloro end = base + btop(mblock[i].size) - 1; 1718*9853d9e8SJason Beloro 1719*9853d9e8SJason Beloro /* 1720*9853d9e8SJason Beloro * Adjust the mblock based on the subset that was deleted. 
1721*9853d9e8SJason Beloro * 1722*9853d9e8SJason Beloro * If the entire mblk was deleted, compact the table. 1723*9853d9e8SJason Beloro * 1724*9853d9e8SJason Beloro * If the middle of the mblk was deleted, extend 1725*9853d9e8SJason Beloro * the table. Space for the new slot was already 1726*9853d9e8SJason Beloro * allocated. 1727*9853d9e8SJason Beloro * 1728*9853d9e8SJason Beloro * The memory to be deleted is a mblock or a subset of 1729*9853d9e8SJason Beloro * and does not span multiple mblocks. 1730*9853d9e8SJason Beloro */ 1731*9853d9e8SJason Beloro if (base == ubase && end == uend) { 1732*9853d9e8SJason Beloro for (j = i; j < nmblocks - 1; j++) 1733*9853d9e8SJason Beloro mblock[j] = mblock[j + 1]; 1734*9853d9e8SJason Beloro nmblocks--; 1735*9853d9e8SJason Beloro bzero(&mblock[nmblocks], sizeof (*mblock)); 1736*9853d9e8SJason Beloro break; 1737*9853d9e8SJason Beloro } else if (base < ubase && end > uend) { 1738*9853d9e8SJason Beloro for (j = nmblocks - 1; j >= i; j--) 1739*9853d9e8SJason Beloro mblock[j + 1] = mblock[j]; 1740*9853d9e8SJason Beloro mblock[i].size = ptob(ubase - base); 1741*9853d9e8SJason Beloro mblock[i].end_pfn = ubase - 1; 1742*9853d9e8SJason Beloro mblock[i + 1].base = ptob(uend + 1); 1743*9853d9e8SJason Beloro mblock[i + 1].size = ptob(end - uend); 1744*9853d9e8SJason Beloro mblock[i + 1].base_pfn = uend + 1; 1745*9853d9e8SJason Beloro nmblocks++; 1746*9853d9e8SJason Beloro break; 1747*9853d9e8SJason Beloro } else if (base == ubase) { 1748*9853d9e8SJason Beloro MPO_DEBUG("mblock_update_del: shrink>" 1749*9853d9e8SJason Beloro " i=%d base=0x%lx end=0x%lx", i, base, end); 1750*9853d9e8SJason Beloro mblock[i].base = ptob(uend + 1); 1751*9853d9e8SJason Beloro mblock[i].size -= ptob(uend - ubase + 1); 1752*9853d9e8SJason Beloro base = uend + 1; 1753*9853d9e8SJason Beloro mblock[i].base_pfn = base; 1754*9853d9e8SJason Beloro mblock[i].end_pfn = end; 1755*9853d9e8SJason Beloro MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end); 1756*9853d9e8SJason 
Beloro break; 1757*9853d9e8SJason Beloro } else if (end == uend) { 1758*9853d9e8SJason Beloro MPO_DEBUG("mblock_update_del: shrink<" 1759*9853d9e8SJason Beloro " i=%d base=0x%lx end=0x%lx", i, base, end); 1760*9853d9e8SJason Beloro mblock[i].size -= ptob(uend - ubase + 1); 1761*9853d9e8SJason Beloro end = ubase - 1; 1762*9853d9e8SJason Beloro mblock[i].base_pfn = base; 1763*9853d9e8SJason Beloro mblock[i].end_pfn = end; 1764*9853d9e8SJason Beloro MPO_DEBUG(" nbase=0x%lx nend=0x%lx\n", base, end); 1765*9853d9e8SJason Beloro break; 1766*9853d9e8SJason Beloro } 1767*9853d9e8SJason Beloro } 1768*9853d9e8SJason Beloro mc_new->mc_nmblocks = nmblocks; 1769*9853d9e8SJason Beloro ASSERT(end > base); 1770*9853d9e8SJason Beloro } 1771*9853d9e8SJason Beloro 1772*9853d9e8SJason Beloro /* 1773*9853d9e8SJason Beloro * mstripe_update 1774*9853d9e8SJason Beloro * 1775*9853d9e8SJason Beloro * Read mblocks from mc and update mstripes in mc 1776*9853d9e8SJason Beloro */ 1777*9853d9e8SJason Beloro 1778*9853d9e8SJason Beloro static void 1779*9853d9e8SJason Beloro mstripe_update(mpo_config_t *mc) 1780*9853d9e8SJason Beloro { 1781*9853d9e8SJason Beloro lgrp_handle_t lgrphand, lgrp_start; 1782*9853d9e8SJason Beloro int i, mnode; 1783*9853d9e8SJason Beloro uint64_t offset, stripe_end, base, end, ra_to_pa, stride; 1784*9853d9e8SJason Beloro uint64_t stripe, frag, remove; 1785*9853d9e8SJason Beloro mem_stripe_t *ms; 1786*9853d9e8SJason Beloro mblock_md_t *mblock = mc->mc_mblocks; 1787*9853d9e8SJason Beloro int nmblocks = mc->mc_nmblocks; 1788*9853d9e8SJason Beloro int mstripesz = MAX_MEM_NODES * nmblocks * sizeof (mem_stripe_t); 1789*9853d9e8SJason Beloro 1790*9853d9e8SJason Beloro /* Check for non-MPO sun4v platforms or memory DR removal */ 1791*9853d9e8SJason Beloro if (n_locality_groups <= 1) { 1792*9853d9e8SJason Beloro ASSERT(n_locality_groups == 1); 1793*9853d9e8SJason Beloro ASSERT(max_locality_groups == 1 && max_mem_nodes == 1); 1794*9853d9e8SJason Beloro 1795*9853d9e8SJason Beloro if 
(nmblocks == 1) { 1796*9853d9e8SJason Beloro mc->mc_nstripes = 0; 1797*9853d9e8SJason Beloro } else { 1798*9853d9e8SJason Beloro mc->mc_nstripes = nmblocks; 1799*9853d9e8SJason Beloro bzero(mc->mc_stripes, mstripesz); 1800*9853d9e8SJason Beloro for (i = 0; i < nmblocks; i++) { 1801*9853d9e8SJason Beloro mc->mc_stripes[i].exists = 1; 1802*9853d9e8SJason Beloro mc->mc_stripes[i].physbase = mblock[i].base_pfn; 1803*9853d9e8SJason Beloro mc->mc_stripes[i].physmax = mblock[i].end_pfn; 1804*9853d9e8SJason Beloro } 1805*9853d9e8SJason Beloro } 1806*9853d9e8SJason Beloro return; 1807*9853d9e8SJason Beloro } 1808*9853d9e8SJason Beloro 1809*9853d9e8SJason Beloro bzero(mc->mc_stripes, mstripesz); 1810*9853d9e8SJason Beloro mc->mc_nstripes = max_locality_groups * nmblocks; 1811*9853d9e8SJason Beloro stripe = ptob(mnode_pages); 1812*9853d9e8SJason Beloro stride = max_locality_groups * stripe; 1813*9853d9e8SJason Beloro 1814*9853d9e8SJason Beloro for (i = 0; i < nmblocks; i++) { 1815*9853d9e8SJason Beloro base = mblock[i].base; 1816*9853d9e8SJason Beloro end = base + mblock[i].size; 1817*9853d9e8SJason Beloro ra_to_pa = mblock[i].ra_to_pa; 1818*9853d9e8SJason Beloro 1819*9853d9e8SJason Beloro /* Find the offset from the prev stripe boundary in PA space. */ 1820*9853d9e8SJason Beloro offset = (base + ra_to_pa) & (stripe - 1); 1821*9853d9e8SJason Beloro 1822*9853d9e8SJason Beloro /* Set the next stripe boundary. */ 1823*9853d9e8SJason Beloro stripe_end = base - offset + stripe; 1824*9853d9e8SJason Beloro 1825*9853d9e8SJason Beloro lgrp_start = (((base + ra_to_pa) & home_mask) >> 1826*9853d9e8SJason Beloro home_mask_shift); 1827*9853d9e8SJason Beloro lgrphand = lgrp_start; 1828*9853d9e8SJason Beloro 1829*9853d9e8SJason Beloro /* 1830*9853d9e8SJason Beloro * Loop over all lgroups covered by the mblock, creating a 1831*9853d9e8SJason Beloro * stripe for each. Stop when lgrp_start is visited again. 
1832*9853d9e8SJason Beloro */ 1833*9853d9e8SJason Beloro do { 1834*9853d9e8SJason Beloro /* mblock may not span all lgroups */ 1835*9853d9e8SJason Beloro if (base >= end) 1836*9853d9e8SJason Beloro break; 1837*9853d9e8SJason Beloro 1838*9853d9e8SJason Beloro mnode = lgrphand; 1839*9853d9e8SJason Beloro ASSERT(mnode < max_mem_nodes); 1840*9853d9e8SJason Beloro 1841*9853d9e8SJason Beloro /* 1842*9853d9e8SJason Beloro * Calculate the size of the fragment that does not 1843*9853d9e8SJason Beloro * belong to the mnode in the last partial stride. 1844*9853d9e8SJason Beloro */ 1845*9853d9e8SJason Beloro frag = (end - (base - offset)) & (stride - 1); 1846*9853d9e8SJason Beloro if (frag == 0) { 1847*9853d9e8SJason Beloro /* remove the gap */ 1848*9853d9e8SJason Beloro remove = stride - stripe; 1849*9853d9e8SJason Beloro } else if (frag < stripe) { 1850*9853d9e8SJason Beloro /* fragment fits in stripe; keep it all */ 1851*9853d9e8SJason Beloro remove = 0; 1852*9853d9e8SJason Beloro } else { 1853*9853d9e8SJason Beloro /* fragment is large; trim after whole stripe */ 1854*9853d9e8SJason Beloro remove = frag - stripe; 1855*9853d9e8SJason Beloro } 1856*9853d9e8SJason Beloro 1857*9853d9e8SJason Beloro ms = &mc->mc_stripes[i * max_locality_groups + mnode]; 1858*9853d9e8SJason Beloro ms->physbase = btop(base); 1859*9853d9e8SJason Beloro ms->physmax = btop(end - 1 - remove); 1860*9853d9e8SJason Beloro ms->offset = btop(offset); 1861*9853d9e8SJason Beloro ms->exists = 1; 1862*9853d9e8SJason Beloro 1863*9853d9e8SJason Beloro base = stripe_end; 1864*9853d9e8SJason Beloro stripe_end += stripe; 1865*9853d9e8SJason Beloro offset = 0; 1866*9853d9e8SJason Beloro lgrphand = (((base + ra_to_pa) & home_mask) >> 1867*9853d9e8SJason Beloro home_mask_shift); 1868*9853d9e8SJason Beloro } while (lgrphand != lgrp_start); 1869*9853d9e8SJason Beloro } 1870*9853d9e8SJason Beloro } 1871*9853d9e8SJason Beloro 1872*9853d9e8SJason Beloro #define INTERSECT(a, b, c, d) \ 1873*9853d9e8SJason Beloro if (((a) 
>= (c) && (a) <= (d)) || \ 1874*9853d9e8SJason Beloro ((c) >= (a) && (c) <= (b))) { \ 1875*9853d9e8SJason Beloro (c) = MAX((a), (c)); \ 1876*9853d9e8SJason Beloro (d) = MIN((b), (d)); \ 1877*9853d9e8SJason Beloro } else { \ 1878*9853d9e8SJason Beloro ASSERT((a) >= (d) || (b) <= (c)); \ 1879*9853d9e8SJason Beloro continue; \ 1880*9853d9e8SJason Beloro } \ 1881*9853d9e8SJason Beloro 1882*9853d9e8SJason Beloro /* 1883*9853d9e8SJason Beloro * mnode_update 1884*9853d9e8SJason Beloro * 1885*9853d9e8SJason Beloro * Read stripes from mc and update mnode extents. The mnode extents are 1886*9853d9e8SJason Beloro * part of the live configuration, so this can only be done at boot time 1887*9853d9e8SJason Beloro * or while holding the mpo_wr_lock. 1888*9853d9e8SJason Beloro */ 1889*9853d9e8SJason Beloro 1890*9853d9e8SJason Beloro static void 1891*9853d9e8SJason Beloro mnode_update(mpo_config_t *mc, pfn_t ubase, pfn_t uend, update_t utype) 1892*9853d9e8SJason Beloro { 1893*9853d9e8SJason Beloro int i, j, mnode, found; 1894*9853d9e8SJason Beloro pfn_t base, end; 1895*9853d9e8SJason Beloro mem_stripe_t *ms; 1896*9853d9e8SJason Beloro 1897*9853d9e8SJason Beloro MPO_DEBUG("mnode_udpate: basepfn: %lx endpfn: %lx\n", ubase, uend); 1898*9853d9e8SJason Beloro 1899*9853d9e8SJason Beloro if (n_locality_groups <= 1 && mc->mc_nmblocks == 1) { 1900*9853d9e8SJason Beloro if (utype == U_ADD) 1901*9853d9e8SJason Beloro mpo_mem_node_add_slice(ubase, uend); 1902*9853d9e8SJason Beloro else if (utype == U_DEL) 1903*9853d9e8SJason Beloro mpo_mem_node_del_slice(ubase, uend); 1904*9853d9e8SJason Beloro else 1905*9853d9e8SJason Beloro panic("mnode update: %d: invalid\n", utype); 1906*9853d9e8SJason Beloro return; 1907*9853d9e8SJason Beloro } 1908*9853d9e8SJason Beloro 1909*9853d9e8SJason Beloro found = 0; 1910*9853d9e8SJason Beloro for (i = 0; i < mc->mc_nmblocks; i++) { 1911*9853d9e8SJason Beloro for (mnode = 0; mnode < max_locality_groups; mnode++) { 1912*9853d9e8SJason Beloro 1913*9853d9e8SJason 
Beloro j = i * max_locality_groups + mnode; 1914*9853d9e8SJason Beloro ms = &mc->mc_stripes[j]; 1915*9853d9e8SJason Beloro if (!ms->exists) 1916*9853d9e8SJason Beloro continue; 1917*9853d9e8SJason Beloro 1918*9853d9e8SJason Beloro base = ms->physbase; 1919*9853d9e8SJason Beloro end = ms->physmax; 1920*9853d9e8SJason Beloro 1921*9853d9e8SJason Beloro /* 1922*9853d9e8SJason Beloro * Look for the mstripes intersecting this slice. 1923*9853d9e8SJason Beloro * 1924*9853d9e8SJason Beloro * The mstripe and slice pairs may not be equal 1925*9853d9e8SJason Beloro * if a subset of a mblock is added/deleted. 1926*9853d9e8SJason Beloro */ 1927*9853d9e8SJason Beloro switch (utype) { 1928*9853d9e8SJason Beloro case U_ADD: 1929*9853d9e8SJason Beloro INTERSECT(ubase, uend, base, end); 1930*9853d9e8SJason Beloro /*FALLTHROUGH*/ 1931*9853d9e8SJason Beloro case U_ADD_ALL: 1932*9853d9e8SJason Beloro if (n_locality_groups > 1) 1933*9853d9e8SJason Beloro mpo_plat_assign_lgrphand_to_mem_node( 1934*9853d9e8SJason Beloro mnode, mnode); 1935*9853d9e8SJason Beloro mpo_mem_node_add_slice(base, end); 1936*9853d9e8SJason Beloro break; 1937*9853d9e8SJason Beloro case U_DEL: 1938*9853d9e8SJason Beloro INTERSECT(ubase, uend, base, end); 1939*9853d9e8SJason Beloro mpo_mem_node_del_slice(base, end); 1940*9853d9e8SJason Beloro break; 1941*9853d9e8SJason Beloro default: 1942*9853d9e8SJason Beloro panic("mnode_update: %d: invalid\n", utype); 1943*9853d9e8SJason Beloro break; 1944*9853d9e8SJason Beloro } 1945*9853d9e8SJason Beloro 1946*9853d9e8SJason Beloro found++; 1947*9853d9e8SJason Beloro } 1948*9853d9e8SJason Beloro } 1949*9853d9e8SJason Beloro 1950*9853d9e8SJason Beloro if (!found) 1951*9853d9e8SJason Beloro panic("mnode_update: mstripe not found"); 1952*9853d9e8SJason Beloro 1953*9853d9e8SJason Beloro #ifdef DEBUG 1954*9853d9e8SJason Beloro if (utype == U_ADD_ALL || utype == U_DEL) 1955*9853d9e8SJason Beloro return; 1956*9853d9e8SJason Beloro found = 0; 1957*9853d9e8SJason Beloro for (i = 0; i < 
max_mem_nodes; i++) { 1958*9853d9e8SJason Beloro if (!mem_node_config[i].exists) 1959*9853d9e8SJason Beloro continue; 1960*9853d9e8SJason Beloro if (ubase >= mem_node_config[i].physbase && 1961*9853d9e8SJason Beloro ubase <= mem_node_config[i].physmax) 1962*9853d9e8SJason Beloro found |= 1; 1963*9853d9e8SJason Beloro if (uend >= mem_node_config[i].physbase && 1964*9853d9e8SJason Beloro uend <= mem_node_config[i].physmax) 1965*9853d9e8SJason Beloro found |= 2; 1966*9853d9e8SJason Beloro } 1967*9853d9e8SJason Beloro ASSERT(found == 3); 1968*9853d9e8SJason Beloro { 1969*9853d9e8SJason Beloro pfn_t minpfn, maxpfn; 1970*9853d9e8SJason Beloro 1971*9853d9e8SJason Beloro mem_node_max_range(&minpfn, &maxpfn); 1972*9853d9e8SJason Beloro ASSERT(minpfn <= ubase); 1973*9853d9e8SJason Beloro ASSERT(maxpfn >= uend); 1974*9853d9e8SJason Beloro } 1975*9853d9e8SJason Beloro #endif 1976*9853d9e8SJason Beloro } 1977*9853d9e8SJason Beloro 1978*9853d9e8SJason Beloro /* 1979*9853d9e8SJason Beloro * Plat_slice_add()/plat_slice_del() are the platform hooks 1980*9853d9e8SJason Beloro * for adding/deleting a pfn range to/from the system. 1981*9853d9e8SJason Beloro * 1982*9853d9e8SJason Beloro * Platform_slice_add() is used for both boot/DR cases. 1983*9853d9e8SJason Beloro * 1984*9853d9e8SJason Beloro * - Zeus has already added the mblocks to the MD, so read the updated 1985*9853d9e8SJason Beloro * MD and allocate all data structures required to manage the new memory 1986*9853d9e8SJason Beloro * configuration. 1987*9853d9e8SJason Beloro * 1988*9853d9e8SJason Beloro * - Recompute the stripes which are derived from the mblocks. 1989*9853d9e8SJason Beloro * 1990*9853d9e8SJason Beloro * - Update (expand) the mnode extents and install the modified mblocks as 1991*9853d9e8SJason Beloro * the new mpo config. This must be done while holding the mpo_wr_lock 1992*9853d9e8SJason Beloro * to guarantee that no other threads access the mpo meta-data. 
1993*9853d9e8SJason Beloro * 1994*9853d9e8SJason Beloro * - Unlock MPO data structures; the new config is live. Free the old config. 1995*9853d9e8SJason Beloro * 1996*9853d9e8SJason Beloro * Plat_slice_del() is used for DR only. 1997*9853d9e8SJason Beloro * 1998*9853d9e8SJason Beloro * - Zeus has not yet modified the MD to reflect the deletion, so copy 1999*9853d9e8SJason Beloro * the old mpo mblocks and delete the range from the copy. 2000*9853d9e8SJason Beloro * 2001*9853d9e8SJason Beloro * - Recompute the stripes which are derived from the mblocks. 2002*9853d9e8SJason Beloro * 2003*9853d9e8SJason Beloro * - Update (shrink) the mnode extents and install the modified mblocks as 2004*9853d9e8SJason Beloro * the new mpo config. This must be done while holding the mpo_wr_lock 2005*9853d9e8SJason Beloro * to guarantee that no other threads access the mpo meta-data. 2006*9853d9e8SJason Beloro * 2007*9853d9e8SJason Beloro * - Unlock MPO data structures; the new config is live. Free the old config. 
2008*9853d9e8SJason Beloro */ 2009*9853d9e8SJason Beloro 2010*9853d9e8SJason Beloro void 2011*9853d9e8SJason Beloro plat_slice_add(pfn_t base, pfn_t end) 2012*9853d9e8SJason Beloro { 2013*9853d9e8SJason Beloro mpo_config_t old_config = mpo_config; 2014*9853d9e8SJason Beloro mpo_config_t new_config; 2015*9853d9e8SJason Beloro 2016*9853d9e8SJason Beloro VALIDATE_SLICE(base, end); 2017*9853d9e8SJason Beloro mblock_update_add(&new_config); 2018*9853d9e8SJason Beloro mstripe_update(&new_config); 2019*9853d9e8SJason Beloro mpo_wr_lock(); 2020*9853d9e8SJason Beloro mblock_install(&new_config); 2021*9853d9e8SJason Beloro /* Use new config to add all ranges for mnode_update */ 2022*9853d9e8SJason Beloro mnode_update(&new_config, base, end, U_ADD); 2023*9853d9e8SJason Beloro mpo_genid++; 2024*9853d9e8SJason Beloro mpo_wr_unlock(); 2025*9853d9e8SJason Beloro mblock_free(&old_config); 2026*9853d9e8SJason Beloro } 2027*9853d9e8SJason Beloro 2028*9853d9e8SJason Beloro void 2029*9853d9e8SJason Beloro plat_slice_del(pfn_t base, pfn_t end) 2030*9853d9e8SJason Beloro { 2031*9853d9e8SJason Beloro mpo_config_t old_config = mpo_config; 2032*9853d9e8SJason Beloro mpo_config_t new_config; 2033*9853d9e8SJason Beloro 2034*9853d9e8SJason Beloro VALIDATE_SLICE(base, end); 2035*9853d9e8SJason Beloro mblock_update_del(&new_config, &old_config, base, end); 2036*9853d9e8SJason Beloro mstripe_update(&new_config); 2037*9853d9e8SJason Beloro mpo_wr_lock(); 2038*9853d9e8SJason Beloro /* Use old config to find deleted range for mnode_update */ 2039*9853d9e8SJason Beloro mnode_update(&old_config, base, end, U_DEL); 2040*9853d9e8SJason Beloro mblock_install(&new_config); 2041*9853d9e8SJason Beloro mpo_genid++; 2042*9853d9e8SJason Beloro mpo_wr_unlock(); 2043*9853d9e8SJason Beloro mblock_free(&old_config); 2044*9853d9e8SJason Beloro } 2045