/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/sysmacros.h>
#include <sys/machsystm.h>
#include <sys/machparam.h>
#include <sys/cmn_err.h>
#include <sys/stat.h>
#include <sys/mach_descrip.h>
#include <sys/memnode.h>
#include <sys/mdesc.h>
#include <sys/mpo.h>
#include <vm/vm_dep.h>

/*
 * MPO and the sun4v memory representation
 * ---------------------------------------
 *
 * Latency groups are defined in the sun4v architecture by memory-latency-group
 * nodes in the Machine Description, as specified in FWARC/2007/260.  These
 * tie together cpu nodes and mblock nodes, and contain mask and match
 * properties that identify the portion of an mblock that belongs to the
 * lgroup.  Mask and match are defined in the Physical Address (PA) space,
 * but an mblock defines Real Addresses (RA).  To translate, the mblock
 * includes the property address-congruence-offset, hereafter referred to as
 * ra_to_pa.  A real address ra is a member of an lgroup if
 *
 *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
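 *
 * As a hypothetical example: with lgroup.mask = 0x600000000,
 * lgroup.match = 0x200000000, and mblock.ra_to_pa = 0x200000000,
 * ra = 0x50000000 translates to pa = 0x250000000, and
 * (0x250000000 & 0x600000000) == 0x200000000, so ra is a member.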
 *
 * The MD is traversed, and information on all mblocks is kept in the array
 * mpo_mblock[].  Information on all CPUs, including which lgroup they map
 * to, is kept in the array mpo_cpu[].
 *
 * This implementation makes (and verifies) the simplifying assumption that
 * the mask bits are the same for all defined lgroups, and that all 1 bits in
 * the mask are contiguous.  Thus the number of lgroups is bounded by the
 * number of possible mask values, and the lgrp_handle_t is defined as the
 * match value, shifted right to eliminate the 0 bit positions in mask.  The
 * masks and values are also referred to as "home bits" in the code.
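 *
 * Continuing the hypothetical example: home_mask = 0x600000000 gives
 * home_mask_shift = 33, so an lgroup with match = 0x400000000 has
 * lgrp_handle_t (0x400000000 >> 33) = 2.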
 *
 * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
 * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
 * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
 * home bits.  This yields the mem_node.
 *
 * Interfaces
 * ----------
 *
 * This file exports the following entry points:
 *
 * plat_lgrp_init()
 * plat_build_mem_nodes()
 * plat_lgrp_cpu_to_hand()
 * plat_lgrp_latency()
 * plat_pfn_to_mem_node()
 *	These implement the usual platform lgroup interfaces.
 *
 * plat_rapfn_to_papfn()
 *	Recover the PA page coloring bits from an RA.
 *
 * plat_mem_node_iterator_init()
 *	Initialize an iterator to efficiently step through pages in a mem_node.
 *
 * plat_mem_node_intersect_range()
 *	Find the intersection with a mem_node.
 */

int	sun4v_mpo_enable = 1;
int	sun4v_mpo_debug = 0;
char	sun4v_mpo_status[256] = "";

/* Save CPU info from the MD and associate CPUs with lgroups */
static	struct cpu_md mpo_cpu[NCPU];

/* Save lgroup info from the MD */
#define	MAX_MD_LGROUPS 32
static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
static	int	n_lgrpnodes = 0;
static	int	n_locality_groups = 0;
static	int	max_locality_groups = 0;

/* Save mblocks from the MD */
static	struct	mblock_md mpo_mblock[MPO_MAX_MBLOCKS];
static	int	n_mblocks = 0;

/* Save mem_node stripes calculated from mblocks and lgroups. */
static mem_stripe_t mem_stripes[MAX_MEM_STRIPES];
static	int	n_mem_stripes = 0;
static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
static	int	stripe_shift;	/* stride/stripes expressed as a shift */
static	pfn_t	mnode_pages;	/* mem_node stripe width */

/* Save home mask and shift used to calculate lgrp_handle_t values */
static	uint64_t home_mask = 0;
static	pfn_t	home_mask_pfn = 0;
static	int	home_mask_shift = 0;
static	uint_t	home_mask_pfn_shift = 0;

/* Save lowest and highest latencies found across all lgroups */
static	int	lower_latency = 0;
static	int	higher_latency = 0;

static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */

static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
static	int	fix_interleave(void);

/* Debug support */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) if (sun4v_mpo_debug) printf(args)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/* Record status message, viewable from mdb */
#define	MPO_STATUS(args...) {						      \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
	MPO_DEBUG(sun4v_mpo_status);					      \
}

/*
 * Routine to read a uint64_t from a given md
 */
static	int64_t
get_int(md_t *md, mde_cookie_t node, char *propname, uint64_t *val)
{
	int err = md_get_prop_val(md, node, propname, val);
	return (err);
}

static int
mblock_cmp(const void *a, const void *b)
{
	struct mblock_md *m1 = (struct mblock_md *)a;
	struct mblock_md *m2 = (struct mblock_md *)b;

	if (m1->base < m2->base)
		return (-1);
	else if (m1->base == m2->base)
		return (0);
	else
		return (1);
}

static void
mblock_sort(struct mblock_md *mblocks, int n)
{
	extern void qsort(void *, size_t, size_t,
	    int (*)(const void *, const void *));

	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
}

/*
 * Traverse the MD to determine:
 *
 *  Number of CPU nodes, lgrp_nodes, and mblocks
 *  Then for each lgrp_node, obtain the appropriate data.
 *  For each CPU, determine its home locality and store it.
 *  For each mblock, retrieve its data and store it.
 */
static	int
lgrp_traverse(md_t *md)
{
	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
	uint64_t i, j, k, o, n_nodes;
	uint64_t n_lgroups = 0;
	uint64_t mem_lg_homeset = 0;
	int ret_val = 0;
	int result = 0;
	int n_cpunodes = 0;
	int sub_page_fix;

	n_nodes = md_node_count(md);

	if (n_nodes <= 0) {
		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
		ret_val = -1;
		goto fail;
	}

	root = md_root_node(md);

	if (root == MDE_INVAL_ELEM_COOKIE) {
		MPO_STATUS("lgrp_traverse: Root node is missing\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Build the Memory Nodes.  Do this before any possibility of
	 * bailing from this routine so we obtain ra_to_pa (needed for page
	 * coloring) even when there are no lgroups defined.
	 */

	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
	    "fwd", &mblocknodes);

	if (n_mblocks <= 0 || n_mblocks > MPO_MAX_MBLOCKS) {
		MPO_STATUS("lgrp_traverse: No mblock "
		    "nodes detected in Machine Descriptor\n");
		n_mblocks = 0;
		ret_val = -1;
		goto fail;
	}

	for (i = 0; i < n_mblocks; i++) {
		mpo_mblock[i].node = mblocknodes[i];

		/* Without a base or size value we will fail */
		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
		    &mpo_mblock[i].base);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_BASE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
		    &mpo_mblock[i].size);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: "
			    "PROP_LG_SIZE is missing\n");
			n_mblocks = 0;
			ret_val = -1;
			goto fail;
		}

		result = get_int(md, mblocknodes[i],
		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);

		/* If we don't have an ra_pa_offset, just set it to 0 */
		if (result < 0)
			mpo_mblock[i].ra_to_pa = 0;

		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
		    "ra_to_pa = %lx\n", i,
		    mpo_mblock[i].base,
		    mpo_mblock[i].size,
		    mpo_mblock[i].ra_to_pa);
	}

	/* Must sort mblocks by address for mem_node_iterator_init() */
	mblock_sort(mpo_mblock, n_mblocks);

	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);

	/* Page coloring hook is required so we can iterate through mnodes */
	if (&page_next_pfn_for_color_cpu == NULL) {
		MPO_STATUS("lgrp_traverse: No page coloring support\n");
		ret_val = -1;
		goto fail;
	}

	/* Global enable for mpo */
	if (sun4v_mpo_enable == 0) {
		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
		ret_val = -1;
		goto fail;
	}

	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
	    "fwd", &lgrpnodes);

	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
		MPO_STATUS("lgrp_traverse: No Lgroups\n");
		ret_val = -1;
		goto fail;
	}

	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);

	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
		    "in MD\n");
		ret_val = -1;
		goto fail;
	}

	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);

	for (i = 0; i < n_lgrpnodes; i++) {
		mpo_lgroup[i].node = lgrpnodes[i];
		mpo_lgroup[i].id = i;
		mpo_lgroup[i].ncpu = 0;
		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
		    &mpo_lgroup[i].addr_mask);
		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
		    &mpo_lgroup[i].addr_match);

		/*
		 * If either the mask or match properties are missing, set to 0
		 */
		if (result < 0) {
			mpo_lgroup[i].addr_mask = 0;
			mpo_lgroup[i].addr_match = 0;
		}

		/* Set latency to 0 if property not present */

		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
		    &mpo_lgroup[i].latency);
		if (result < 0)
			mpo_lgroup[i].latency = 0;
	}

	/*
	 * Sub-page level interleave is not yet supported.  Check for it,
	 * and remove sub-page interleaved lgroups from mpo_lgroup and
	 * n_lgrpnodes.  If no lgroups are left, return.
	 */

	sub_page_fix = fix_interleave();
	if (n_lgrpnodes == 0) {
		ret_val = -1;
		goto fail;
	}

	/* Ensure that all of the addr_mask values are the same */

	for (i = 0; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
			MPO_STATUS("lgrp_traverse: "
			    "addr_mask values are not the same\n");
			ret_val = -1;
			goto fail;
		}
	}

	/*
	 * Ensure that all lgrp nodes see all the mblocks. However, if
	 * sub-page interleave is being fixed, they do not, so skip
	 * the check.
	 */

	if (sub_page_fix == 0) {
		for (i = 0; i < n_lgrpnodes; i++) {
			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
			    PROP_LG_MBLOCK, "fwd", &nodes);
			md_free_scan_dag(md, &nodes);
			if (j != n_mblocks) {
				MPO_STATUS("lgrp_traverse: "
				    "lgroup does not see all mblocks\n");
				ret_val = -1;
				goto fail;
			}
		}
	}

	/*
	 * Use the address mask from the first lgroup node
	 * to establish our home_mask.
	 */
	home_mask = mpo_lgroup[0].addr_mask;
	home_mask_pfn = btop(home_mask);
	home_mask_shift = lowbit(home_mask) - 1;
	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
	mnode_pages = btop(1ULL << home_mask_shift);

	/*
	 * How many values are possible in home mask?  Assume the mask
	 * bits are contiguous.
	 */
	max_locality_groups =
	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);

	/* Now verify the home mask bits are contiguous */

	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
		MPO_STATUS("lgrp_traverse: "
		    "home mask bits are not contiguous\n");
		ret_val = -1;
		goto fail;
	}
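
	/*
	 * Worked example with hypothetical values: home_mask = 0x600000000
	 * covers PA bits 33-34, so home_mask_shift = 33, home_mask_pfn =
	 * btop(0x600000000) = 0x300000, home_mask_pfn_shift = 33 - PAGESHIFT
	 * = 20, and mnode_pages = 2^20 pfns (an 8GB slice of 8K pages).
	 * Then max_locality_groups = 1 << highbit(0x300000 >> 20) = 4, and
	 * the contiguity check above passes since 4 - 1 == 3.
	 */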

	/* Record all of the home bits */

	for (i = 0; i < n_lgrpnodes; i++) {
		HOMESET_ADD(mem_lg_homeset,
		    mpo_lgroup[i].addr_match >> home_mask_shift);
	}

	/* Count the number of different "home" mem_lg's we've discovered */

	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);

	/* If we have only 1 locality group then we can exit */
	if (n_locality_groups == 1) {
		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
		ret_val = -1;
		goto fail;
	}

	/*
	 * Set the latencies.  A CPU's lgroup is defined by the lowest
	 * latency found.  All other memory is considered remote, and the
	 * remote latency is represented by the highest latency found.
	 * Thus hierarchical lgroups, if any, are approximated by a
	 * two level scheme.
	 *
	 * The Solaris MPO framework by convention wants to see latencies
	 * in units of nano-sec/10. In the MD, the units are defined to be
	 * pico-seconds.
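	 *
	 * For example, a hypothetical MD latency of 400000 ps becomes
	 * 400000 / 10000 = 40 units of nano-sec/10 (i.e., 400 ns) after
	 * the division below.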
	 */

	lower_latency = mpo_lgroup[0].latency;
	higher_latency = mpo_lgroup[0].latency;

	for (i = 1; i < n_lgrpnodes; i++) {
		if (mpo_lgroup[i].latency < lower_latency) {
			lower_latency = mpo_lgroup[i].latency;
		}
		if (mpo_lgroup[i].latency > higher_latency) {
			higher_latency = mpo_lgroup[i].latency;
		}
	}
	lower_latency /= 10000;
	higher_latency /= 10000;

	/* Clear our CPU data */

	for (i = 0; i < NCPU; i++) {
		mpo_cpu[i].home = 0;
		mpo_cpu[i].latency = (uint_t)(-1);
	}

	/* Build the CPU nodes */
	for (i = 0; i < n_cpunodes; i++) {

		/* Read in the lgroup nodes */

		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
		if (result < 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
			ret_val = -1;
			goto fail;
		}

		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
		    "fwd", &nodes);
		if (n_lgroups <= 0) {
			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing\n");
			ret_val = -1;
			goto fail;
		}

		/*
		 * Find the lgroup this cpu belongs to with the lowest latency.
		 * Check all the lgrp nodes connected to this CPU to determine
		 * which has the smallest latency.
		 */

		for (j = 0; j < n_lgroups; j++) {
			for (o = 0; o < n_lgrpnodes; o++) {
				if (nodes[j] == mpo_lgroup[o].node) {
					if (mpo_lgroup[o].latency <
					    mpo_cpu[k].latency) {
						mpo_cpu[k].home =
						    mpo_lgroup[o].addr_match
						    >> home_mask_shift;
						mpo_cpu[k].latency =
						    mpo_lgroup[o].latency;
						mpo_lgroup[o].ncpu++;
					}
				}
			}
		}
		md_free_scan_dag(md, &nodes);
	}

	/* Validate that no large pages cross mnode boundaries. */
	if (valid_pages(md, cpunodes[0]) == 0) {
		ret_val = -1;
		goto fail;
	}

fail:
	/* MD cookies are no longer valid; ensure they are not used again. */
	for (i = 0; i < n_mblocks; i++)
		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
	for (i = 0; i < n_lgrpnodes; i++)
		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;

	if (n_cpunodes > 0)
		md_free_scan_dag(md, &cpunodes);
	if (n_lgrpnodes > 0)
		md_free_scan_dag(md, &lgrpnodes);
	if (n_mblocks > 0)
		md_free_scan_dag(md, &mblocknodes);
	else
		panic("lgrp_traverse: No memory blocks found");

	if (ret_val == 0)
		MPO_STATUS("MPO feature is enabled.\n");

	return (ret_val);
}

/*
 *  Determine the number of unique mem_lg's present in our system
 */
static	int
unique_home_mem_lg_count(uint64_t mem_lg_homeset)
{
	int homeid;
	int count = 0;

	/*
	 * Scan the "home" bits of the mem_lgs, count
	 * the number that are unique.
	 */

	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
			count++;
		}
	}

	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
	    mem_lg_homeset);
	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);

	/* Default must be at least one */
	if (count == 0)
		count = 1;

	return (count);
}

/*
 * Platform specific lgroup initialization
 */
void
plat_lgrp_init(void)
{
	md_t *md;
	int i, rc, ncpu_min;

	/* Get the Machine Descriptor handle */

	md = md_get_handle();

	/* If not, we cannot continue */

	if (md == NULL) {
		panic("cannot access machine descriptor\n");
	} else {
		rc = lgrp_traverse(md);
		(void) md_fini_handle(md);
	}

	/*
	 * If we can't process the MD for lgroups then at least let the
	 * system try to boot.  Assume we have one lgroup so that
	 * when plat_build_mem_nodes is called, it will attempt to init
	 * an mnode based on the supplied memory segment.
	 */

	if (rc == -1) {
		home_mask_pfn = 0;
		max_locality_groups = 1;
		n_locality_groups = 1;
		return;
	}

	mem_node_pfn_shift = 0;
	mem_node_physalign = 0;

	/* Use lgroup-aware TSB allocations */
	tsb_lgrp_affinity = 1;

	/*
	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
	 * this process is currently running on before considering
	 * expanding threads to another lgroup.
	 *
	 * lgrp_expand_proc_diff determines how much less the remote lgroup
	 * must be loaded before expanding to it.
	 *
	 * On sun4v CMT processors, threads share a core pipeline, and
	 * at less than 100% utilization, best throughput is obtained by
	 * spreading threads across more cores, even if some are in a
	 * different lgroup.  Spread threads to a new lgroup if the
	 * current group is more than 50% loaded.  Because of virtualization,
	 * lgroups may have different numbers of CPUs, but the tunables
	 * apply to all lgroups, so find the smallest lgroup and compute
	 * 50% loading.
	 */

	ncpu_min = NCPU;
	for (i = 0; i < n_lgrpnodes; i++) {
		int ncpu = mpo_lgroup[i].ncpu;
		if (ncpu != 0 && ncpu < ncpu_min)
			ncpu_min = ncpu;
	}
	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;

	/* new home may only be half as loaded as the existing home to use it */
	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;

	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;

	/* Require that a home lgroup have some memory to be chosen */
	lgrp_mem_free_thresh = 1;

	/* Standard home-on-next-touch policy */
	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;

	/* Disable option to choose root lgroup if all leaf lgroups are busy */
	lgrp_load_thresh = UINT32_MAX;
}

/*
 *  Helper routine for debugging calls to mem_node_add_slice()
 */
static	void
mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
{
#if defined(DEBUG) && !defined(lint)
	static int slice_count = 0;

	slice_count++;
	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
	    slice_count, basepfn, endpfn);
#endif
	mem_node_add_slice(basepfn, endpfn);
}

/*
 *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
 */
static	void
mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
{
	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld, "
	    "mnode index: %d\n", plathand, mnode);
	plat_assign_lgrphand_to_mem_node(plathand, mnode);
}

/*
 * plat_build_mem_nodes()
 *
 * Define the mem_nodes based on the modified boot memory list,
 * or based on info read from the MD in plat_lgrp_init().
 *
 * When the home mask lies in the middle of the address bits (as it does on
 * Victoria Falls), then the memory in one mem_node is no longer contiguous;
 * it is striped across an mblock in a repeating pattern of contiguous memory
 * followed by a gap.  The stripe width is the size of the contiguous piece.
 * The stride is the distance from the start of one contiguous piece to the
 * start of the next.  The gap is thus stride - stripe_width.
 *
 * The stripe of an mnode that falls within an mblock is described by the type
 * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
 * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
 * this array is predetermined.  The mem_stripe_t that describes mnode m
 * within mpo_mblock[i] is stored at
 *	 mem_stripes[ m + i * max_locality_groups ]
 *
 * max_locality_groups is the total number of possible locality groups,
 * as defined by the size of the home mask, even if the memory assigned
 * to the domain is small and does not cover all the lgroups.  Thus some
 * mem_stripe_t's may be empty.
 *
 * The members of mem_stripe_t are:
 *	physbase: First valid page in mem_node in the corresponding mblock
 *	physmax: Last valid page in mem_node in mblock
 *	offset:  The full stripe width starts at physbase - offset.
 *	    Thus if offset is non-zero, this mem_node starts in the middle
 *	    of a stripe width, and the second full stripe starts at
 *	    physbase - offset + stride.  (even though physmax may fall in the
 *	    middle of a stripe width, we do not save the ending fragment size
 *	    in this data structure.)
 *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
 *
 *	The stripe width is kept in the global mnode_pages.
 *	The stride is kept in the global mnode_stride.
 *	All the above use pfn's as the unit.
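 *
 *	Hypothetical example: with mnode_pages = 0x1000 and
 *	max_locality_groups = 4 (so mnode_stride = 0x4000), an mblock
 *	whose memory for mnode m begins 0x400 pfns into a stripe width
 *	records offset = 0x400, and its second full stripe begins at
 *	physbase - 0x400 + 0x4000.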
 *
 * As an example, the memory layout for a domain with 2 mblocks and 4
 * mem_nodes 0,1,2,3 could look like this:
 *
 *	123012301230 ...	012301230123 ...
 *	  mblock 0		  mblock 1
 */

void
plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
{
	lgrp_handle_t lgrphand, lgrp_start;
	int i, mnode, elem;
	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
	uint64_t stripe, frag, remove;
	mem_stripe_t *ms;

	/* Check for non-MPO sun4v platforms */

	if (n_locality_groups <= 1) {
		mpo_plat_assign_lgrphand_to_mem_node((lgrp_handle_t)0, 0);
		for (elem = 0; elem < nelems; elem += 2) {
			base = list[elem];
			len = list[elem+1];

			mpo_mem_node_add_slice(btop(base),
			    btop(base + len - 1));
		}
		mem_node_pfn_shift = 0;
		mem_node_physalign = 0;
		n_mem_stripes = 0;
		return;
	}

	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
	max_mem_nodes = max_locality_groups;
	bzero(mem_stripes, sizeof (mem_stripes));
	stripe = ptob(mnode_pages);
	stride = max_locality_groups * stripe;

	/* Save commonly used values in globals */
	mnode_stride = btop(stride);
	n_mem_stripes = max_locality_groups * n_mblocks;
	stripe_shift = highbit(max_locality_groups) - 1;

	for (i = 0; i < n_mblocks; i++) {

		base = mpo_mblock[i].base;
		end = mpo_mblock[i].base + mpo_mblock[i].size;
		ra_to_pa = mpo_mblock[i].ra_to_pa;
		mpo_mblock[i].base_pfn = btop(base);
		mpo_mblock[i].end_pfn = btop(end - 1);

		/* Find the offset from the prev stripe boundary in PA space. */
		offset = (base + ra_to_pa) & (stripe - 1);

		/* Set the next stripe boundary. */
		stripe_end = base - offset + stripe;

		lgrp_start = (((base + ra_to_pa) & home_mask) >>
		    home_mask_shift);
		lgrphand = lgrp_start;

		/*
		 * Loop over all lgroups covered by the mblock, creating a
		 * stripe for each.  Stop when lgrp_start is visited again.
		 */
		do {
			/* mblock may not span all lgroups */
			if (base >= end)
				break;

			mnode = lgrphand;
			ASSERT(mnode < max_mem_nodes);

			/*
			 * Calculate the size of the fragment that does not
			 * belong to the mnode in the last partial stride.
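			 * (Hypothetical example: with stripe = 0x1000 and
			 * stride = 0x4000 bytes, frag = 0x2500 yields
			 * remove = 0x1500 below, keeping only the first
			 * whole stripe of the final stride.)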
			 */
			frag = (end - (base - offset)) & (stride - 1);
			if (frag == 0) {
				/* remove the gap */
				remove = stride - stripe;
			} else if (frag < stripe) {
				/* fragment fits in stripe; keep it all */
				remove = 0;
			} else {
				/* fragment is large; trim after whole stripe */
				remove = frag - stripe;
			}

			ms = &mem_stripes[i * max_locality_groups + mnode];
			ms->physbase = btop(base);
			ms->physmax = btop(end - 1 - remove);
			ms->offset = btop(offset);
			ms->exists = 1;

			mpo_plat_assign_lgrphand_to_mem_node(lgrphand, mnode);
			mpo_mem_node_add_slice(ms->physbase, ms->physmax);

			base = stripe_end;
			stripe_end += stripe;
			offset = 0;
			lgrphand = (((base + ra_to_pa) & home_mask) >>
			    home_mask_shift);
		} while (lgrphand != lgrp_start);
	}

	/*
	 * Indicate to vm_pagelist that the hpm_counters array
	 * should be shared because the ranges overlap.
	 */
	if (max_mem_nodes > 1) {
		interleaved_mnodes = 1;
	}
}

/*
 * Return the locality group value for the supplied processor
 */
lgrp_handle_t
plat_lgrp_cpu_to_hand(processorid_t id)
{
	if (n_locality_groups > 1) {
		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
	} else {
		return ((lgrp_handle_t)0); /* Default */
	}
}

int
plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
{
	/*
	 * Return min remote latency when there are more than two lgroups
	 * (root and child) and getting latency between two different lgroups
	 * or root is involved.
	 */
	if (lgrp_optimizations() && (from != to ||
	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
		return ((int)higher_latency);
	} else {
		return ((int)lower_latency);
	}
}

int
plat_pfn_to_mem_node(pfn_t pfn)
{
	int i, mnode;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	if (n_locality_groups <= 1)
		return (0);

	/*
	 * The mnode is defined to be 1:1 with the lgroup handle, which
	 * is taken from the home bits.  Find the mblock in which
	 * the pfn falls to get the ra_to_pa adjustment, and extract
	 * the home bits.
	 */
	mb = &mpo_mblock[0];
	for (i = 0; i < n_mblocks; i++) {
		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
			    home_mask_pfn_shift);
			ASSERT(mnode < max_mem_nodes);
			return (mnode);
		}
		mb++;
	}

	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_rapfn_to_papfn
 *
 * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
 * and home mask bits are correct.  The upper bits do not necessarily
 * match the actual PA, however.
 */
pfn_t
plat_rapfn_to_papfn(pfn_t pfn)
{
	int i;
	pfn_t ra_to_pa_pfn;
	struct mblock_md *mb;

	ASSERT(n_mblocks > 0);
	if (n_mblocks == 1)
		return (pfn + base_ra_to_pa_pfn);

	/*
	 * Find the mblock in which the pfn falls
	 * in order to get the ra_to_pa adjustment.
	 */
	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
			ra_to_pa_pfn = btop(mb->ra_to_pa);
			return (pfn + ra_to_pa_pfn);
		}
	}

	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
	return (pfn);
}

/*
 * plat_mem_node_iterator_init()
 *	Initialize cookie to iterate over pfn's in an mnode.  There is
 *	no additional iterator function.  The caller uses the info from
 *	the iterator structure directly.
 *
 *	pfn: starting pfn.
 *	mnode: desired mnode.
 *	init: set to 1 for full init, 0 for continuation
 *
 *	Returns the appropriate starting pfn for the iteration,
 *	which is the same as the input pfn if it falls in an mblock.
 *	Returns the (pfn_t)-1 value if the input pfn lies past
 *	the last valid mnode pfn.
 */
pfn_t
plat_mem_node_iterator_init(pfn_t pfn, int mnode,
    mem_node_iterator_t *it, int init)
{
	int i;
	struct mblock_md *mblock;
	pfn_t base, end;

	ASSERT(it != NULL);
	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
	ASSERT(n_mblocks > 0);

	if (init) {
		it->mi_last_mblock = 0;
		it->mi_init = 1;
	}

	/* Check if mpo is not enabled and we only have one mblock */
	if (n_locality_groups == 1 && n_mblocks == 1) {
		it->mi_mnode = mnode;
		it->mi_ra_to_pa = base_ra_to_pa_pfn;
		it->mi_mnode_pfn_mask = 0;
		it->mi_mnode_pfn_shift = 0;
		it->mi_mnode_mask = 0;
		it->mi_mblock_base = mem_node_config[mnode].physbase;
		it->mi_mblock_end = mem_node_config[mnode].physmax;
		if (pfn < it->mi_mblock_base)
			pfn = it->mi_mblock_base;
		else if (pfn > it->mi_mblock_end)
			pfn = (pfn_t)-1;
		return (pfn);
	}

	/*
	 * Find mblock that contains pfn, or first mblock after pfn,
	 * else pfn is out of bounds, so use the last mblock.
	 * mblocks are sorted in ascending address order.
	 */
	ASSERT(it->mi_last_mblock < n_mblocks);
	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
	i = init ? 0 : it->mi_last_mblock + 1;
	if (i == n_mblocks)
		return ((pfn_t)-1);

	for (; i < n_mblocks; i++) {
		if (pfn <= mpo_mblock[i].end_pfn)
			break;
	}
	if (i == n_mblocks) {
		it->mi_last_mblock = i - 1;
		return ((pfn_t)-1);
	}
	it->mi_last_mblock = i;

	/*
	 * Memory stripes are defined if there is more than one locality
	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
	 */
	mblock = &mpo_mblock[i];
	if (n_mem_stripes > 0) {
		mem_stripe_t *ms =
		    &mem_stripes[i * max_locality_groups + mnode];
		base = ms->physbase;
		end = ms->physmax;
	} else {
		ASSERT(mnode == 0);
		base = mblock->base_pfn;
		end = mblock->end_pfn;
	}

	it->mi_mnode = mnode;
	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
	it->mi_mblock_base = base;
	it->mi_mblock_end = end;
	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
	it->mi_mnode_mask = max_locality_groups - 1;
	if (pfn < base)
		pfn = base;
	else if (pfn > end)
		pfn = (pfn_t)-1;
	return (pfn);
}

/*
 * plat_mem_node_intersect_range()
 *
 * Find the intersection between a memnode and a range of pfn's.
 */
void
plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
    int mnode, pgcnt_t *npages_out)
{
	pfn_t offset, len, hole, base, end, test_end, frag;
	pfn_t nearest;
	mem_stripe_t *ms;
	int i, npages;

	*npages_out = 0;

	if (!mem_node_config[mnode].exists || test_len == 0)
		return;

	base = mem_node_config[mnode].physbase;
	end = mem_node_config[mnode].physmax;

	test_end = test_base + test_len - 1;
	if (end < test_base || base > test_end)
		return;

	if (n_locality_groups == 1) {
		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
		return;
	}

	hole = mnode_stride - mnode_pages;
	npages = 0;

	/*
	 * Iterate over all the stripes for this mnode (one per mblock),
	 * find the intersection with each, and accumulate the intersections.
	 *
	 * Determining the intersection with a stripe is tricky.  If base or
	 * end fall outside the mem_node bounds, round them to physbase/physmax
	 * of mem_node.  If base or end fall in a gap, round them to start of
	 * nearest stripe.  If they fall within a stripe, keep base or end,
	 * but calculate the fragment size that should be excluded from the
	 * stripe.  Calculate how many strides fall in the adjusted range,
	 * multiply by stripe width, and add the start and end fragments.
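	 *
	 * Hypothetical example: with mnode_pages = 0x1000 and mnode_stride =
	 * 0x4000 (hole = 0x3000), a test_base landing 0x400 pfns into a
	 * stripe is kept, and offset = 0x400 is excluded from that stripe's
	 * contribution; a test_base landing in the hole is rounded up to the
	 * start of the next stripe.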
	 */

	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
		ms = &mem_stripes[i];
		if (ms->exists &&
		    test_base <= (end = ms->physmax) &&
		    test_end >= (base = ms->physbase)) {

			offset = ms->offset;

			if (test_base > base) {
				/* Round test_base to next multiple of stride */
				len = P2ROUNDUP(test_base - (base - offset),
				    mnode_stride);
				nearest = base - offset + len;
				/*
				 * Compute distance from test_base to the
				 * stride boundary to see if test_base falls
				 * in the stripe or in the hole.
				 */
				if (nearest - test_base > hole) {
					/*
					 * test_base lies in stripe,
					 * and offset should be excluded.
					 */
					offset = test_base -
					    (nearest - mnode_stride);
					base = test_base;
				} else {
					/* round up to next stripe start */
					offset = 0;
					base = nearest;
					if (base > end)
						continue;
				}

			}

			if (test_end < end)
				end = test_end;
			end++;		/* adjust to an exclusive bound */

			/* Round end to next multiple of stride */
			len = P2ROUNDUP(end - (base - offset), mnode_stride);
			nearest = (base - offset) + len;
			if (nearest - end <= hole) {
				/* end falls in hole, use entire last stripe */
				frag = 0;
			} else {
				/* end falls in stripe, compute fragment */
				frag = nearest - hole - end;
			}

			len = (len >> stripe_shift) - offset - frag;
			npages += len;
		}
	}

	*npages_out = npages;
}

/*
 * valid_pages()
 *
 * Return 1 if pages are valid and do not cross mnode boundaries
 * (which would break page free list assumptions), and 0 otherwise.
 */

#define	MNODE(pa)	\
	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)

static int
valid_pages(md_t *md, mde_cookie_t cpu0)
{
	int i, max_szc;
	uint64_t last_page_base, szc_mask;
	uint64_t max_page_len, max_coalesce_len;
	struct mblock_md *mb = mpo_mblock;

	/*
	 * Find the smaller of the largest page possible and supported.
	 * mmu_exported_pagesize_mask is not yet initialized, so read
	 * it from the MD.  Apply minimal fixups in case of broken MDs
	 * to get a sane mask.
	 */

	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
		szc_mask = 0;
	szc_mask |= (1 << TTE4M);	/* largest in sun4v default support */
	max_szc = highbit(szc_mask) - 1;
	if (max_szc > TTE256M)
		max_szc = TTE256M;
	max_page_len = TTEBYTES(max_szc);

	/*
	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
	 * if mmu-page-size-list does not contain it, so 256M pages must fall
	 * within one mnode to use MPO.
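	 * (With the hypothetical 8GB mnode slice from the examples above,
	 * ptob(mnode_pages) = 0x200000000 >= 0x10000000, so the size check
	 * below would pass.)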
	 */
	max_coalesce_len = TTEBYTES(TTE256M);
	ASSERT(max_coalesce_len >= max_page_len);

	if (ptob(mnode_pages) < max_coalesce_len) {
		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
		return (0);
	}

	for (i = 0; i < n_mblocks; i++) {
		uint64_t base = mb->base;
		uint64_t end = mb->base + mb->size - 1;
		uint64_t ra_to_pa = mb->ra_to_pa;

		/*
		 * If mblock is smaller than the max page size, then
		 * RA = PA mod MAXPAGE is not guaranteed, but it must
		 * not span mnodes.
		 */
		if (mb->size < max_page_len) {
			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
				MPO_STATUS("Small mblock spans mnodes; "
				    "MPO disabled: base = %lx, end = %lx, "
				    "ra2pa = %lx\n", base, end, ra_to_pa);
				return (0);
			}
		} else {
			/* Verify RA = PA mod MAXPAGE, using coalesce size */
			uint64_t pa_base = base + ra_to_pa;
			if ((base & (max_coalesce_len - 1)) !=
			    (pa_base & (max_coalesce_len - 1))) {
				MPO_STATUS("bad page alignment; MPO disabled: "
				    "ra = %lx, pa = %lx, pagelen = %lx\n",
				    base, pa_base, max_coalesce_len);
				return (0);
			}
		}

		/*
		 * Find start of last large page in mblock in RA space.
		 * If page extends into the next mblock, verify the
		 * mnode does not change.
		 */
		last_page_base = P2ALIGN(end, max_coalesce_len);
		if (i + 1 < n_mblocks &&
		    last_page_base + max_coalesce_len > mb[1].base &&
		    MNODE(last_page_base + ra_to_pa) !=
		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
			MPO_STATUS("Large page spans mblocks; MPO disabled: "
			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
			    mb[1].ra_to_pa, max_coalesce_len);
			return (0);
		}

		mb++;
	}
	return (1);
}

/*
 * fix_interleave() - Find lgroups with sub-page sized memory interleave,
 * if any, and remove them.  This yields a config where the "coarse
 * grained" lgroups cover all of memory, even though part of that memory
 * is fine grain interleaved and does not deliver a purely local memory
 * latency.
 *
 * This function reads and modifies the globals:
 *	mpo_lgroup[], n_lgrpnodes
 *
 * Returns 1 if lgroup nodes were removed, 0 otherwise.
 */

static int
fix_interleave(void)
{
	int i, j;
	uint64_t mask = 0;

	j = 0;
	for (i = 0; i < n_lgrpnodes; i++) {
		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
			/* remove this lgroup */
			mask = mpo_lgroup[i].addr_mask;
		} else {
			mpo_lgroup[j++] = mpo_lgroup[i];
		}
	}
	n_lgrpnodes = j;

	if (mask != 0)
		MPO_STATUS("sub-page interleave %lx found; "
		    "removing lgroup.\n", mask);

	return (mask != 0);
}