xref: /titanic_54/usr/src/uts/sun4v/os/mpo.c (revision bb57d1f5164aca913cbd286ae1b61c896167cfa7)
1ce8eb11aSdp78419 /*
2ce8eb11aSdp78419  * CDDL HEADER START
3ce8eb11aSdp78419  *
4ce8eb11aSdp78419  * The contents of this file are subject to the terms of the
5ce8eb11aSdp78419  * Common Development and Distribution License (the "License").
6ce8eb11aSdp78419  * You may not use this file except in compliance with the License.
7ce8eb11aSdp78419  *
8ce8eb11aSdp78419  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9ce8eb11aSdp78419  * or http://www.opensolaris.org/os/licensing.
10ce8eb11aSdp78419  * See the License for the specific language governing permissions
11ce8eb11aSdp78419  * and limitations under the License.
12ce8eb11aSdp78419  *
13ce8eb11aSdp78419  * When distributing Covered Code, include this CDDL HEADER in each
14ce8eb11aSdp78419  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15ce8eb11aSdp78419  * If applicable, add the following below this CDDL HEADER, with the
16ce8eb11aSdp78419  * fields enclosed by brackets "[]" replaced with your own identifying
17ce8eb11aSdp78419  * information: Portions Copyright [yyyy] [name of copyright owner]
18ce8eb11aSdp78419  *
19ce8eb11aSdp78419  * CDDL HEADER END
20ce8eb11aSdp78419  */
21ce8eb11aSdp78419 
22ce8eb11aSdp78419 /*
23ce8eb11aSdp78419  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24ce8eb11aSdp78419  * Use is subject to license terms.
25ce8eb11aSdp78419  */
26ce8eb11aSdp78419 
27ce8eb11aSdp78419 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28ce8eb11aSdp78419 
29ce8eb11aSdp78419 #include <sys/types.h>
30ce8eb11aSdp78419 #include <sys/sysmacros.h>
31ce8eb11aSdp78419 #include <sys/machsystm.h>
32ce8eb11aSdp78419 #include <sys/machparam.h>
33ce8eb11aSdp78419 #include <sys/cmn_err.h>
34ce8eb11aSdp78419 #include <sys/stat.h>
35ce8eb11aSdp78419 #include <sys/mach_descrip.h>
36ce8eb11aSdp78419 #include <sys/memnode.h>
37ce8eb11aSdp78419 #include <sys/mdesc.h>
38ce8eb11aSdp78419 #include <sys/mpo.h>
39ce8eb11aSdp78419 #include <vm/vm_dep.h>
40e853d8c3Sjc25722 #include <vm/hat_sfmmu.h>
41*bb57d1f5Sjc25722 #include <sys/promif.h>
42ce8eb11aSdp78419 
43ce8eb11aSdp78419 /*
44ce8eb11aSdp78419  * MPO and the sun4v memory representation
45ce8eb11aSdp78419  * ---------------------------------------
46ce8eb11aSdp78419  *
47ce8eb11aSdp78419  * Latency groups are defined in the sun4v architecture by memory-latency-group
48ce8eb11aSdp78419  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
49ce8eb11aSdp78419  * tie together cpu nodes and mblock nodes, and contain mask and match
50ce8eb11aSdp78419  * properties that identify the portion of an mblock that belongs to the
51ce8eb11aSdp78419  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
52ce8eb11aSdp78419  * but an mblock defines Real Addresses (RA).  To translate, the mblock
53ce8eb11aSdp78419  * includes the property address-congruence-offset, hereafter referred to as
54ce8eb11aSdp78419  * ra_to_pa.  A real address ra is a member of an lgroup if
55ce8eb11aSdp78419  *
56ce8eb11aSdp78419  *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
57ce8eb11aSdp78419  *
58ce8eb11aSdp78419  * The MD is traversed, and information on all mblocks is kept in the array
59ce8eb11aSdp78419  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
60ce8eb11aSdp78419  * to, is kept in the array mpo_cpu[].
61ce8eb11aSdp78419  *
62ce8eb11aSdp78419  * This implementation makes (and verifies) the simplifying assumption that
63ce8eb11aSdp78419  * the mask bits are the same for all defined lgroups, and that all 1 bits in
64ce8eb11aSdp78419  * the mask are contiguous.  Thus the number of lgroups is bounded by the
65ce8eb11aSdp78419  * number of possible mask values, and the lgrp_handle_t is defined as the
66ce8eb11aSdp78419  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
67ce8eb11aSdp78419  * masks and values are also referred to as "home bits" in the code.
68ce8eb11aSdp78419  *
69ce8eb11aSdp78419  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
70ce8eb11aSdp78419  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
71ce8eb11aSdp78419  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
72ce8eb11aSdp78419  * home bits.  This yields the mem_node.
73ce8eb11aSdp78419  *
74ce8eb11aSdp78419  * Interfaces
75ce8eb11aSdp78419  * ----------
76ce8eb11aSdp78419  *
77ce8eb11aSdp78419  * This file exports the following entry points:
78ce8eb11aSdp78419  *
79ce8eb11aSdp78419  * plat_lgrp_init()
80ce8eb11aSdp78419  * plat_build_mem_nodes()
81ce8eb11aSdp78419  * plat_lgrp_cpu_to_hand()
82ce8eb11aSdp78419  * plat_lgrp_latency()
83ce8eb11aSdp78419  * plat_pfn_to_mem_node()
84ce8eb11aSdp78419  *	These implement the usual platform lgroup interfaces.
85ce8eb11aSdp78419  *
86ce8eb11aSdp78419  * plat_rapfn_to_papfn()
87ce8eb11aSdp78419  *	Recover the PA page coloring bits from an RA.
88ce8eb11aSdp78419  *
89ce8eb11aSdp78419  * plat_mem_node_iterator_init()
90ce8eb11aSdp78419  *	Initialize an iterator to efficiently step through pages in a mem_node.
91ce8eb11aSdp78419  *
92ce8eb11aSdp78419  * plat_mem_node_intersect_range()
93ce8eb11aSdp78419  *	Find the intersection with a mem_node.
94ce8eb11aSdp78419  */
95ce8eb11aSdp78419 
96ce8eb11aSdp78419 int	sun4v_mpo_enable = 1;
97ce8eb11aSdp78419 int	sun4v_mpo_debug = 0;
98ce8eb11aSdp78419 char	sun4v_mpo_status[256] = "";
99ce8eb11aSdp78419 
100ce8eb11aSdp78419 /* Save CPU info from the MD and associate CPUs with lgroups */
101ce8eb11aSdp78419 static	struct cpu_md mpo_cpu[NCPU];
102ce8eb11aSdp78419 
103ce8eb11aSdp78419 /* Save lgroup info from the MD */
104ce8eb11aSdp78419 #define	MAX_MD_LGROUPS 32
105ce8eb11aSdp78419 static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
106ce8eb11aSdp78419 static	int	n_lgrpnodes = 0;
107ce8eb11aSdp78419 static	int	n_locality_groups = 0;
108ce8eb11aSdp78419 static	int	max_locality_groups = 0;
109ce8eb11aSdp78419 
110ce8eb11aSdp78419 /* Save mblocks from the MD */
111*bb57d1f5Sjc25722 #define	SMALL_MBLOCKS_COUNT	8
112*bb57d1f5Sjc25722 static 	struct	mblock_md *mpo_mblock;
113*bb57d1f5Sjc25722 static	struct 	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
114ce8eb11aSdp78419 static	int	n_mblocks = 0;
115ce8eb11aSdp78419 
116ce8eb11aSdp78419 /* Save mem_node stripes calculated from mblocks and lgroups. */
117*bb57d1f5Sjc25722 static mem_stripe_t *mem_stripes;
118*bb57d1f5Sjc25722 static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
119*bb57d1f5Sjc25722 static	int 	mstripesz = 0;
120ce8eb11aSdp78419 static	int	n_mem_stripes = 0;
121ce8eb11aSdp78419 static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
122ce8eb11aSdp78419 static	int	stripe_shift;	/* stride/stripes expressed as a shift */
123ce8eb11aSdp78419 static	pfn_t	mnode_pages;	/* mem_node stripe width */
124ce8eb11aSdp78419 
125ce8eb11aSdp78419 /* Save home mask and shift used to calculate lgrp_handle_t values */
126ce8eb11aSdp78419 static	uint64_t home_mask = 0;
127ce8eb11aSdp78419 static	pfn_t	home_mask_pfn = 0;
128ce8eb11aSdp78419 static	int	home_mask_shift = 0;
129ce8eb11aSdp78419 static	uint_t	home_mask_pfn_shift = 0;
130ce8eb11aSdp78419 
131ce8eb11aSdp78419 /* Save lowest and highest latencies found across all lgroups */
132ce8eb11aSdp78419 static	int	lower_latency = 0;
133ce8eb11aSdp78419 static	int	higher_latency = 0;
134ce8eb11aSdp78419 
135ce8eb11aSdp78419 static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
136ce8eb11aSdp78419 
137ce8eb11aSdp78419 static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
138ce8eb11aSdp78419 static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
139ce8eb11aSdp78419 static	int	fix_interleave(void);
140ce8eb11aSdp78419 
/*
 * Debug support.
 *
 * MPO_DEBUG() prints its printf-style arguments when sun4v_mpo_debug is
 * non-zero, and compiles to nothing in non-DEBUG or lint builds.  It is
 * wrapped in do/while so that it behaves as a single statement and cannot
 * capture a following "else".
 */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(args...) do {						      \
	if (sun4v_mpo_debug)						      \
		printf(args);						      \
} while (0)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/*
 * Record status message, viewable from mdb.
 *
 * The message is formatted into sun4v_mpo_status (truncated to the size of
 * that buffer) and then echoed through MPO_DEBUG().  The echo uses "%s" so
 * the already formatted text is never re-interpreted as a format string.
 * The do/while wrapper makes the macro safe in unbraced if/else bodies.
 */
#define	MPO_STATUS(args...) do {					      \
	(void) snprintf(sun4v_mpo_status, sizeof (sun4v_mpo_status), args);   \
	MPO_DEBUG("%s", sun4v_mpo_status);				      \
} while (0)
153ce8eb11aSdp78419 
154ce8eb11aSdp78419 /*
155ce8eb11aSdp78419  * Routine to read a uint64_t from a given md
156ce8eb11aSdp78419  */
157ce8eb11aSdp78419 static	int64_t
158ce8eb11aSdp78419 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
159ce8eb11aSdp78419 {
160ce8eb11aSdp78419 	int err = md_get_prop_val(md, node, propname, val);
161ce8eb11aSdp78419 	return (err);
162ce8eb11aSdp78419 }
163ce8eb11aSdp78419 
164ce8eb11aSdp78419 static int
165ce8eb11aSdp78419 mblock_cmp(const void *a, const void *b)
166ce8eb11aSdp78419 {
167ce8eb11aSdp78419 	struct mblock_md *m1 = (struct mblock_md *)a;
168ce8eb11aSdp78419 	struct mblock_md *m2 = (struct mblock_md *)b;
169ce8eb11aSdp78419 
170ce8eb11aSdp78419 	if (m1->base < m2->base)
171ce8eb11aSdp78419 		return (-1);
172ce8eb11aSdp78419 	else if (m1->base == m2->base)
173ce8eb11aSdp78419 		return (0);
174ce8eb11aSdp78419 	else
175ce8eb11aSdp78419 		return (1);
176ce8eb11aSdp78419 }
177ce8eb11aSdp78419 
178ce8eb11aSdp78419 static void
179ce8eb11aSdp78419 mblock_sort(struct mblock_md *mblocks, int n)
180ce8eb11aSdp78419 {
181ce8eb11aSdp78419 	extern void qsort(void *, size_t, size_t,
182ce8eb11aSdp78419 	    int (*)(const void *, const void *));
183ce8eb11aSdp78419 
184ce8eb11aSdp78419 	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
185ce8eb11aSdp78419 }
186ce8eb11aSdp78419 
187ce8eb11aSdp78419 /*
188ce8eb11aSdp78419  *
189ce8eb11aSdp78419  * Traverse the MD to determine:
190ce8eb11aSdp78419  *
191ce8eb11aSdp78419  *  Number of CPU nodes, lgrp_nodes, and mblocks
192ce8eb11aSdp78419  *  Then for each lgrp_node, obtain the appropriate data.
193ce8eb11aSdp78419  *  For each CPU, determine its home locality and store it.
194ce8eb11aSdp78419  *  For each mblock, retrieve its data and store it.
195ce8eb11aSdp78419  */
196ce8eb11aSdp78419 static	int
197ce8eb11aSdp78419 lgrp_traverse(md_t *md)
198ce8eb11aSdp78419 {
199ce8eb11aSdp78419 	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
200ce8eb11aSdp78419 	uint64_t i, j, k, o, n_nodes;
201ce8eb11aSdp78419 	uint64_t n_lgroups = 0;
202ce8eb11aSdp78419 	uint64_t mem_lg_homeset = 0;
203ce8eb11aSdp78419 	int ret_val = 0;
204ce8eb11aSdp78419 	int result = 0;
205ce8eb11aSdp78419 	int n_cpunodes = 0;
206ce8eb11aSdp78419 	int sub_page_fix;
207*bb57d1f5Sjc25722 	int mblocksz = 0;
208*bb57d1f5Sjc25722 	size_t allocsz;
209ce8eb11aSdp78419 
210ce8eb11aSdp78419 	n_nodes = md_node_count(md);
211ce8eb11aSdp78419 
212ce8eb11aSdp78419 	if (n_nodes <= 0) {
213ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
214ce8eb11aSdp78419 		ret_val = -1;
215ce8eb11aSdp78419 		goto fail;
216ce8eb11aSdp78419 	}
217ce8eb11aSdp78419 
218ce8eb11aSdp78419 	root = md_root_node(md);
219ce8eb11aSdp78419 
220ce8eb11aSdp78419 	if (root == MDE_INVAL_ELEM_COOKIE) {
221ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: Root node is missing\n");
222ce8eb11aSdp78419 		ret_val = -1;
223ce8eb11aSdp78419 		goto fail;
224ce8eb11aSdp78419 	}
225ce8eb11aSdp78419 
226ce8eb11aSdp78419 	/*
227ce8eb11aSdp78419 	 * Build the Memory Nodes.  Do this before any possibility of
228ce8eb11aSdp78419 	 * bailing from this routine so we obtain ra_to_pa (needed for page
229ce8eb11aSdp78419 	 * coloring) even when there are no lgroups defined.
230ce8eb11aSdp78419 	 */
231ce8eb11aSdp78419 
232ce8eb11aSdp78419 	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
233ce8eb11aSdp78419 	    "fwd", &mblocknodes);
234ce8eb11aSdp78419 
235*bb57d1f5Sjc25722 	if (n_mblocks <= 0) {
236ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: No mblock "
237ce8eb11aSdp78419 		    "nodes detected in Machine Descriptor\n");
238ce8eb11aSdp78419 		n_mblocks = 0;
239ce8eb11aSdp78419 		ret_val = -1;
240ce8eb11aSdp78419 		goto fail;
241ce8eb11aSdp78419 	}
242*bb57d1f5Sjc25722 	/*
243*bb57d1f5Sjc25722 	 * If we have a small number of mblocks we will use the space
244*bb57d1f5Sjc25722 	 * that we preallocated. Otherwise, we will dynamically
245*bb57d1f5Sjc25722 	 * allocate the space
246*bb57d1f5Sjc25722 	 */
247*bb57d1f5Sjc25722 	mblocksz = n_mblocks * sizeof (struct mblock_md);
248*bb57d1f5Sjc25722 	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);
249*bb57d1f5Sjc25722 
250*bb57d1f5Sjc25722 	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
251*bb57d1f5Sjc25722 		mpo_mblock = &small_mpo_mblocks[0];
252*bb57d1f5Sjc25722 		mem_stripes = &small_mem_stripes[0];
253*bb57d1f5Sjc25722 	} else {
254*bb57d1f5Sjc25722 		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
255*bb57d1f5Sjc25722 		/* Ensure that we dont request more space than reserved */
256*bb57d1f5Sjc25722 		if (allocsz > MPOBUF_SIZE) {
257*bb57d1f5Sjc25722 			MPO_STATUS("lgrp_traverse: Insufficient space "
258*bb57d1f5Sjc25722 			    "for mblock structures \n");
259*bb57d1f5Sjc25722 			ret_val = -1;
260*bb57d1f5Sjc25722 			n_mblocks = 0;
261*bb57d1f5Sjc25722 			goto fail;
262*bb57d1f5Sjc25722 		}
263*bb57d1f5Sjc25722 		mpo_mblock = (struct mblock_md *)
264*bb57d1f5Sjc25722 		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
265*bb57d1f5Sjc25722 		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
266*bb57d1f5Sjc25722 			MPO_STATUS("lgrp_traverse: Cannot allocate space "
267*bb57d1f5Sjc25722 			    "for mblocks \n");
268*bb57d1f5Sjc25722 			ret_val = -1;
269*bb57d1f5Sjc25722 			n_mblocks = 0;
270*bb57d1f5Sjc25722 			goto fail;
271*bb57d1f5Sjc25722 		}
272*bb57d1f5Sjc25722 		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
273*bb57d1f5Sjc25722 		mpo_heap32_bufsz = MPOBUF_SIZE;
274*bb57d1f5Sjc25722 
275*bb57d1f5Sjc25722 		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
276*bb57d1f5Sjc25722 	}
277ce8eb11aSdp78419 
278ce8eb11aSdp78419 	for (i = 0; i < n_mblocks; i++) {
279ce8eb11aSdp78419 		mpo_mblock[i].node = mblocknodes[i];
280*bb57d1f5Sjc25722 		mpo_mblock[i].mnode_mask = (mnodeset_t)0;
281ce8eb11aSdp78419 
282ce8eb11aSdp78419 		/* Without a base or size value we will fail */
283ce8eb11aSdp78419 		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
284ce8eb11aSdp78419 		    &mpo_mblock[i].base);
285ce8eb11aSdp78419 		if (result < 0) {
286ce8eb11aSdp78419 			MPO_STATUS("lgrp_traverse: "
287ce8eb11aSdp78419 			    "PROP_LG_BASE is missing\n");
288ce8eb11aSdp78419 			n_mblocks = 0;
289ce8eb11aSdp78419 			ret_val = -1;
290ce8eb11aSdp78419 			goto fail;
291ce8eb11aSdp78419 		}
292ce8eb11aSdp78419 
293ce8eb11aSdp78419 		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
294ce8eb11aSdp78419 		    &mpo_mblock[i].size);
295ce8eb11aSdp78419 		if (result < 0) {
296ce8eb11aSdp78419 			MPO_STATUS("lgrp_traverse: "
297ce8eb11aSdp78419 			    "PROP_LG_SIZE is missing\n");
298ce8eb11aSdp78419 			n_mblocks = 0;
299ce8eb11aSdp78419 			ret_val = -1;
300ce8eb11aSdp78419 			goto fail;
301ce8eb11aSdp78419 		}
302ce8eb11aSdp78419 
303ce8eb11aSdp78419 		result = get_int(md, mblocknodes[i],
304ce8eb11aSdp78419 		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
305ce8eb11aSdp78419 
306ce8eb11aSdp78419 		/* If we don't have an ra_pa_offset, just set it to 0 */
307ce8eb11aSdp78419 		if (result < 0)
308ce8eb11aSdp78419 			mpo_mblock[i].ra_to_pa = 0;
309ce8eb11aSdp78419 
310ce8eb11aSdp78419 		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
311ce8eb11aSdp78419 		    "ra_to_pa = %lx\n", i,
312ce8eb11aSdp78419 		    mpo_mblock[i].base,
313ce8eb11aSdp78419 		    mpo_mblock[i].size,
314ce8eb11aSdp78419 		    mpo_mblock[i].ra_to_pa);
315ce8eb11aSdp78419 	}
316ce8eb11aSdp78419 
317ce8eb11aSdp78419 	/* Must sort mblocks by address for mem_node_iterator_init() */
318ce8eb11aSdp78419 	mblock_sort(mpo_mblock, n_mblocks);
319ce8eb11aSdp78419 
320ce8eb11aSdp78419 	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
321ce8eb11aSdp78419 
322ce8eb11aSdp78419 	/* Page coloring hook is required so we can iterate through mnodes */
323ce8eb11aSdp78419 	if (&page_next_pfn_for_color_cpu == NULL) {
324ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: No page coloring support\n");
325ce8eb11aSdp78419 		ret_val = -1;
326ce8eb11aSdp78419 		goto fail;
327ce8eb11aSdp78419 	}
328ce8eb11aSdp78419 
329ce8eb11aSdp78419 	/* Global enable for mpo */
330ce8eb11aSdp78419 	if (sun4v_mpo_enable == 0) {
331ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
332ce8eb11aSdp78419 		ret_val = -1;
333ce8eb11aSdp78419 		goto fail;
334ce8eb11aSdp78419 	}
335ce8eb11aSdp78419 
336ce8eb11aSdp78419 	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
337ce8eb11aSdp78419 	    "fwd", &lgrpnodes);
338ce8eb11aSdp78419 
339ce8eb11aSdp78419 	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
340ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: No Lgroups\n");
341ce8eb11aSdp78419 		ret_val = -1;
342ce8eb11aSdp78419 		goto fail;
343ce8eb11aSdp78419 	}
344ce8eb11aSdp78419 
345ce8eb11aSdp78419 	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
346ce8eb11aSdp78419 
347ce8eb11aSdp78419 	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
348ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
349ce8eb11aSdp78419 		    "in MD\n");
350ce8eb11aSdp78419 		ret_val = -1;
351ce8eb11aSdp78419 		goto fail;
352ce8eb11aSdp78419 	}
353ce8eb11aSdp78419 
354ce8eb11aSdp78419 	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
355ce8eb11aSdp78419 	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
356ce8eb11aSdp78419 	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
357ce8eb11aSdp78419 	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
358ce8eb11aSdp78419 	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
359ce8eb11aSdp78419 	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
360ce8eb11aSdp78419 
361ce8eb11aSdp78419 	for (i = 0; i < n_lgrpnodes; i++) {
362ce8eb11aSdp78419 		mpo_lgroup[i].node = lgrpnodes[i];
363ce8eb11aSdp78419 		mpo_lgroup[i].id = i;
364ce8eb11aSdp78419 		mpo_lgroup[i].ncpu = 0;
365ce8eb11aSdp78419 		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
366ce8eb11aSdp78419 		    &mpo_lgroup[i].addr_mask);
367ce8eb11aSdp78419 		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
368ce8eb11aSdp78419 		    &mpo_lgroup[i].addr_match);
369ce8eb11aSdp78419 
370ce8eb11aSdp78419 		/*
371ce8eb11aSdp78419 		 * If either the mask or match properties are missing, set to 0
372ce8eb11aSdp78419 		 */
373ce8eb11aSdp78419 		if (result < 0) {
374ce8eb11aSdp78419 			mpo_lgroup[i].addr_mask = 0;
375ce8eb11aSdp78419 			mpo_lgroup[i].addr_match = 0;
376ce8eb11aSdp78419 		}
377ce8eb11aSdp78419 
378ce8eb11aSdp78419 		/* Set latency to 0 if property not present */
379ce8eb11aSdp78419 
380ce8eb11aSdp78419 		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
381ce8eb11aSdp78419 		    &mpo_lgroup[i].latency);
382ce8eb11aSdp78419 		if (result < 0)
383ce8eb11aSdp78419 			mpo_lgroup[i].latency = 0;
384ce8eb11aSdp78419 	}
385ce8eb11aSdp78419 
386ce8eb11aSdp78419 	/*
387ce8eb11aSdp78419 	 * Sub-page level interleave is not yet supported.  Check for it,
388ce8eb11aSdp78419 	 * and remove sub-page interleaved lgroups from mpo_lgroup and
389ce8eb11aSdp78419 	 * n_lgrpnodes.  If no lgroups are left, return.
390ce8eb11aSdp78419 	 */
391ce8eb11aSdp78419 
392ce8eb11aSdp78419 	sub_page_fix = fix_interleave();
393ce8eb11aSdp78419 	if (n_lgrpnodes == 0) {
394ce8eb11aSdp78419 		ret_val = -1;
395ce8eb11aSdp78419 		goto fail;
396ce8eb11aSdp78419 	}
397ce8eb11aSdp78419 
398ce8eb11aSdp78419 	/* Ensure that all of the addr_mask values are the same */
399ce8eb11aSdp78419 
400ce8eb11aSdp78419 	for (i = 0; i < n_lgrpnodes; i++) {
401ce8eb11aSdp78419 		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
402ce8eb11aSdp78419 			MPO_STATUS("lgrp_traverse: "
403ce8eb11aSdp78419 			    "addr_mask values are not the same\n");
404ce8eb11aSdp78419 			ret_val = -1;
405ce8eb11aSdp78419 			goto fail;
406ce8eb11aSdp78419 		}
407ce8eb11aSdp78419 	}
408ce8eb11aSdp78419 
409ce8eb11aSdp78419 	/*
410ce8eb11aSdp78419 	 * Ensure that all lgrp nodes see all the mblocks. However, if
411ce8eb11aSdp78419 	 * sub-page interleave is being fixed, they do not, so skip
412ce8eb11aSdp78419 	 * the check.
413ce8eb11aSdp78419 	 */
414ce8eb11aSdp78419 
415ce8eb11aSdp78419 	if (sub_page_fix == 0) {
416ce8eb11aSdp78419 		for (i = 0; i < n_lgrpnodes; i++) {
417ce8eb11aSdp78419 			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
418ce8eb11aSdp78419 			    PROP_LG_MBLOCK, "fwd", &nodes);
419ce8eb11aSdp78419 			md_free_scan_dag(md, &nodes);
420ce8eb11aSdp78419 			if (j != n_mblocks) {
421ce8eb11aSdp78419 				MPO_STATUS("lgrp_traverse: "
422ce8eb11aSdp78419 				    "sub-page interleave is being fixed\n");
423ce8eb11aSdp78419 				ret_val = -1;
424ce8eb11aSdp78419 				goto fail;
425ce8eb11aSdp78419 			}
426ce8eb11aSdp78419 		}
427ce8eb11aSdp78419 	}
428ce8eb11aSdp78419 
429ce8eb11aSdp78419 	/*
430ce8eb11aSdp78419 	 * Use the address mask from the first lgroup node
431ce8eb11aSdp78419 	 * to establish our home_mask.
432ce8eb11aSdp78419 	 */
433ce8eb11aSdp78419 	home_mask = mpo_lgroup[0].addr_mask;
434ce8eb11aSdp78419 	home_mask_pfn = btop(home_mask);
435ce8eb11aSdp78419 	home_mask_shift = lowbit(home_mask) - 1;
436ce8eb11aSdp78419 	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
437ce8eb11aSdp78419 	mnode_pages = btop(1ULL << home_mask_shift);
438ce8eb11aSdp78419 
439ce8eb11aSdp78419 	/*
440ce8eb11aSdp78419 	 * How many values are possible in home mask?  Assume the mask
441ce8eb11aSdp78419 	 * bits are contiguous.
442ce8eb11aSdp78419 	 */
443ce8eb11aSdp78419 	max_locality_groups =
444ce8eb11aSdp78419 	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
445ce8eb11aSdp78419 
446ce8eb11aSdp78419 	/* Now verify the home mask bits are contiguous */
447ce8eb11aSdp78419 
448ce8eb11aSdp78419 	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
449ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: "
450ce8eb11aSdp78419 		    "home mask bits are not contiguous\n");
451ce8eb11aSdp78419 		ret_val = -1;
452ce8eb11aSdp78419 		goto fail;
453ce8eb11aSdp78419 	}
454ce8eb11aSdp78419 
455ce8eb11aSdp78419 	/* Record all of the home bits */
456ce8eb11aSdp78419 
457ce8eb11aSdp78419 	for (i = 0; i < n_lgrpnodes; i++) {
458ce8eb11aSdp78419 		HOMESET_ADD(mem_lg_homeset,
459ce8eb11aSdp78419 		    mpo_lgroup[i].addr_match >> home_mask_shift);
460ce8eb11aSdp78419 	}
461ce8eb11aSdp78419 
462ce8eb11aSdp78419 	/* Count the number different "home"  mem_lg's we've discovered */
463ce8eb11aSdp78419 
464ce8eb11aSdp78419 	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
465ce8eb11aSdp78419 
466ce8eb11aSdp78419 	/* If we have only 1 locality group then we can exit */
467ce8eb11aSdp78419 	if (n_locality_groups == 1) {
468ce8eb11aSdp78419 		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
469ce8eb11aSdp78419 		ret_val = -1;
470ce8eb11aSdp78419 		goto fail;
471ce8eb11aSdp78419 	}
472ce8eb11aSdp78419 
473ce8eb11aSdp78419 	/*
474ce8eb11aSdp78419 	 * Set the latencies.  A CPU's lgroup is defined by the lowest
475ce8eb11aSdp78419 	 * latency found.  All other memory is considered remote, and the
476ce8eb11aSdp78419 	 * remote latency is represented by the highest latency found.
477ce8eb11aSdp78419 	 * Thus hierarchical lgroups, if any, are approximated by a
478ce8eb11aSdp78419 	 * two level scheme.
479ce8eb11aSdp78419 	 *
480ce8eb11aSdp78419 	 * The Solaris MPO framework by convention wants to see latencies
481ce8eb11aSdp78419 	 * in units of nano-sec/10. In the MD, the units are defined to be
482ce8eb11aSdp78419 	 * pico-seconds.
483ce8eb11aSdp78419 	 */
484ce8eb11aSdp78419 
485ce8eb11aSdp78419 	lower_latency = mpo_lgroup[0].latency;
486ce8eb11aSdp78419 	higher_latency = mpo_lgroup[0].latency;
487ce8eb11aSdp78419 
488ce8eb11aSdp78419 	for (i = 1; i < n_lgrpnodes; i++) {
489ce8eb11aSdp78419 		if (mpo_lgroup[i].latency < lower_latency) {
490ce8eb11aSdp78419 			lower_latency = mpo_lgroup[i].latency;
491ce8eb11aSdp78419 		}
492ce8eb11aSdp78419 		if (mpo_lgroup[i].latency > higher_latency) {
493ce8eb11aSdp78419 			higher_latency = mpo_lgroup[i].latency;
494ce8eb11aSdp78419 		}
495ce8eb11aSdp78419 	}
496ce8eb11aSdp78419 	lower_latency /= 10000;
497ce8eb11aSdp78419 	higher_latency /= 10000;
498ce8eb11aSdp78419 
499ce8eb11aSdp78419 	/* Clear our CPU data */
500ce8eb11aSdp78419 
501ce8eb11aSdp78419 	for (i = 0; i < NCPU; i++) {
502ce8eb11aSdp78419 		mpo_cpu[i].home = 0;
503ce8eb11aSdp78419 		mpo_cpu[i].latency = (uint_t)(-1);
504ce8eb11aSdp78419 	}
505ce8eb11aSdp78419 
506ce8eb11aSdp78419 	/* Build the CPU nodes */
507ce8eb11aSdp78419 	for (i = 0; i < n_cpunodes; i++) {
508ce8eb11aSdp78419 
509ce8eb11aSdp78419 		/* Read in the lgroup nodes */
510ce8eb11aSdp78419 
511ce8eb11aSdp78419 		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
512ce8eb11aSdp78419 		if (result < 0) {
513ce8eb11aSdp78419 			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
514ce8eb11aSdp78419 			ret_val = -1;
515ce8eb11aSdp78419 			goto fail;
516ce8eb11aSdp78419 		}
517ce8eb11aSdp78419 
518ce8eb11aSdp78419 		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
519ce8eb11aSdp78419 		    "fwd", &nodes);
520ce8eb11aSdp78419 		if (n_lgroups <= 0) {
521ce8eb11aSdp78419 			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
522ce8eb11aSdp78419 			ret_val = -1;
523ce8eb11aSdp78419 			goto fail;
524ce8eb11aSdp78419 		}
525ce8eb11aSdp78419 
526ce8eb11aSdp78419 		/*
527ce8eb11aSdp78419 		 * Find the lgroup this cpu belongs to with the lowest latency.
528ce8eb11aSdp78419 		 * Check all the lgrp nodes connected to this CPU to determine
529ce8eb11aSdp78419 		 * which has the smallest latency.
530ce8eb11aSdp78419 		 */
531ce8eb11aSdp78419 
532ce8eb11aSdp78419 		for (j = 0; j < n_lgroups; j++) {
533ce8eb11aSdp78419 			for (o = 0; o < n_lgrpnodes; o++) {
534ce8eb11aSdp78419 				if (nodes[j] == mpo_lgroup[o].node) {
535ce8eb11aSdp78419 					if (mpo_lgroup[o].latency <
536ce8eb11aSdp78419 					    mpo_cpu[k].latency) {
537ce8eb11aSdp78419 						mpo_cpu[k].home =
538ce8eb11aSdp78419 						    mpo_lgroup[o].addr_match
539ce8eb11aSdp78419 						    >> home_mask_shift;
540ce8eb11aSdp78419 						mpo_cpu[k].latency =
541ce8eb11aSdp78419 						    mpo_lgroup[o].latency;
542ce8eb11aSdp78419 						mpo_lgroup[o].ncpu++;
543ce8eb11aSdp78419 					}
544ce8eb11aSdp78419 				}
545ce8eb11aSdp78419 			}
546ce8eb11aSdp78419 		}
547ce8eb11aSdp78419 		md_free_scan_dag(md, &nodes);
548ce8eb11aSdp78419 	}
549ce8eb11aSdp78419 
550ce8eb11aSdp78419 	/* Validate that no large pages cross mnode boundaries. */
551ce8eb11aSdp78419 	if (valid_pages(md, cpunodes[0]) == 0) {
552ce8eb11aSdp78419 		ret_val = -1;
553ce8eb11aSdp78419 		goto fail;
554ce8eb11aSdp78419 	}
555ce8eb11aSdp78419 
556ce8eb11aSdp78419 fail:
557ce8eb11aSdp78419 	/* MD cookies are no longer valid; ensure they are not used again. */
558ce8eb11aSdp78419 	for (i = 0; i < n_mblocks; i++)
559ce8eb11aSdp78419 		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
560ce8eb11aSdp78419 	for (i = 0; i < n_lgrpnodes; i++)
561ce8eb11aSdp78419 		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
562ce8eb11aSdp78419 
563ce8eb11aSdp78419 	if (n_cpunodes > 0)
564ce8eb11aSdp78419 		md_free_scan_dag(md, &cpunodes);
565ce8eb11aSdp78419 	if (n_lgrpnodes > 0)
566ce8eb11aSdp78419 		md_free_scan_dag(md, &lgrpnodes);
567ce8eb11aSdp78419 	if (n_mblocks > 0)
568ce8eb11aSdp78419 		md_free_scan_dag(md, &mblocknodes);
569ce8eb11aSdp78419 	else
570ce8eb11aSdp78419 		panic("lgrp_traverse: No memory blocks found");
571ce8eb11aSdp78419 
572ce8eb11aSdp78419 	if (ret_val == 0)
573ce8eb11aSdp78419 		MPO_STATUS("MPO feature is enabled.\n");
574ce8eb11aSdp78419 
575ce8eb11aSdp78419 	return (ret_val);
576ce8eb11aSdp78419 }
577ce8eb11aSdp78419 
578ce8eb11aSdp78419 /*
579ce8eb11aSdp78419  *  Determine the number of unique mem_lg's present in our system
580ce8eb11aSdp78419  */
581ce8eb11aSdp78419 static	int
582ce8eb11aSdp78419 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
583ce8eb11aSdp78419 {
584ce8eb11aSdp78419 	int homeid;
585ce8eb11aSdp78419 	int count = 0;
586ce8eb11aSdp78419 
587ce8eb11aSdp78419 	/*
588ce8eb11aSdp78419 	 * Scan the "home" bits of the mem_lgs, count
589ce8eb11aSdp78419 	 * the number that are unique.
590ce8eb11aSdp78419 	 */
591ce8eb11aSdp78419 
592ce8eb11aSdp78419 	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
593ce8eb11aSdp78419 		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
594ce8eb11aSdp78419 			count++;
595ce8eb11aSdp78419 		}
596ce8eb11aSdp78419 	}
597ce8eb11aSdp78419 
598ce8eb11aSdp78419 	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
599ce8eb11aSdp78419 	    mem_lg_homeset);
600ce8eb11aSdp78419 	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
601ce8eb11aSdp78419 
602ce8eb11aSdp78419 	/* Default must be at least one */
603ce8eb11aSdp78419 	if (count == 0)
604ce8eb11aSdp78419 		count = 1;
605ce8eb11aSdp78419 
606ce8eb11aSdp78419 	return (count);
607ce8eb11aSdp78419 }
608ce8eb11aSdp78419 
609ce8eb11aSdp78419 /*
610ce8eb11aSdp78419  * Platform specific lgroup initialization
611ce8eb11aSdp78419  */
612ce8eb11aSdp78419 void
613ce8eb11aSdp78419 plat_lgrp_init(void)
614ce8eb11aSdp78419 {
615ce8eb11aSdp78419 	md_t *md;
616ce8eb11aSdp78419 	int i, rc, ncpu_min;
617ce8eb11aSdp78419 
618ce8eb11aSdp78419 	/* Get the Machine Descriptor handle */
619ce8eb11aSdp78419 
620ce8eb11aSdp78419 	md = md_get_handle();
621ce8eb11aSdp78419 
622ce8eb11aSdp78419 	/* If not, we cannot continue */
623ce8eb11aSdp78419 
624ce8eb11aSdp78419 	if (md == NULL) {
625ce8eb11aSdp78419 		panic("cannot access machine descriptor\n");
626ce8eb11aSdp78419 	} else {
627ce8eb11aSdp78419 		rc = lgrp_traverse(md);
628ce8eb11aSdp78419 		(void) md_fini_handle(md);
629ce8eb11aSdp78419 	}
630ce8eb11aSdp78419 
631ce8eb11aSdp78419 	/*
632ce8eb11aSdp78419 	 * If we can't process the MD for lgroups then at least let the
633ce8eb11aSdp78419 	 * system try to boot.  Assume we have one lgroup so that
634ce8eb11aSdp78419 	 * when plat_build_mem_nodes is called, it will attempt to init
635ce8eb11aSdp78419 	 * an mnode based on the supplied memory segment.
636ce8eb11aSdp78419 	 */
637ce8eb11aSdp78419 
638ce8eb11aSdp78419 	if (rc == -1) {
639ce8eb11aSdp78419 		home_mask_pfn = 0;
640ce8eb11aSdp78419 		max_locality_groups = 1;
641ce8eb11aSdp78419 		n_locality_groups = 1;
642ce8eb11aSdp78419 		return;
643ce8eb11aSdp78419 	}
644ce8eb11aSdp78419 
645ce8eb11aSdp78419 	mem_node_pfn_shift = 0;
646ce8eb11aSdp78419 	mem_node_physalign = 0;
647ce8eb11aSdp78419 
648ce8eb11aSdp78419 	/* Use lgroup-aware TSB allocations */
649ce8eb11aSdp78419 	tsb_lgrp_affinity = 1;
650ce8eb11aSdp78419 
651ce8eb11aSdp78419 	/*
652ce8eb11aSdp78419 	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
653ce8eb11aSdp78419 	 * this process is currently running on before considering
654ce8eb11aSdp78419 	 * expanding threads to another lgroup.
655ce8eb11aSdp78419 	 *
656ce8eb11aSdp78419 	 * lgrp_expand_proc_diff determines how much less the remote lgroup
657ce8eb11aSdp78419 	 * must be loaded before expanding to it.
658ce8eb11aSdp78419 	 *
659ce8eb11aSdp78419 	 * On sun4v CMT processors, threads share a core pipeline, and
660ce8eb11aSdp78419 	 * at less than 100% utilization, best throughput is obtained by
661ce8eb11aSdp78419 	 * spreading threads across more cores, even if some are in a
662ce8eb11aSdp78419 	 * different lgroup.  Spread threads to a new lgroup if the
663ce8eb11aSdp78419 	 * current group is more than 50% loaded.  Because of virtualization,
664ce8eb11aSdp78419 	 * lgroups may have different numbers of CPUs, but the tunables
665ce8eb11aSdp78419 	 * apply to all lgroups, so find the smallest lgroup and compute
666ce8eb11aSdp78419 	 * 50% loading.
667ce8eb11aSdp78419 	 */
668ce8eb11aSdp78419 
669ce8eb11aSdp78419 	ncpu_min = NCPU;
670ce8eb11aSdp78419 	for (i = 0; i < n_lgrpnodes; i++) {
671ce8eb11aSdp78419 		int ncpu = mpo_lgroup[i].ncpu;
672ce8eb11aSdp78419 		if (ncpu != 0 && ncpu < ncpu_min)
673ce8eb11aSdp78419 			ncpu_min = ncpu;
674ce8eb11aSdp78419 	}
675ce8eb11aSdp78419 	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
676ce8eb11aSdp78419 
677ce8eb11aSdp78419 	/* new home may only be half as loaded as the existing home to use it */
678ce8eb11aSdp78419 	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
679ce8eb11aSdp78419 
680ce8eb11aSdp78419 	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
681ce8eb11aSdp78419 
682ce8eb11aSdp78419 	/* Require that a home lgroup have some memory to be chosen */
683ce8eb11aSdp78419 	lgrp_mem_free_thresh = 1;
684ce8eb11aSdp78419 
685ce8eb11aSdp78419 	/* Standard home-on-next-touch policy */
686ce8eb11aSdp78419 	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
687ce8eb11aSdp78419 
688ce8eb11aSdp78419 	/* Disable option to choose root lgroup if all leaf lgroups are busy */
689ce8eb11aSdp78419 	lgrp_load_thresh = UINT32_MAX;
690ce8eb11aSdp78419 }
691ce8eb11aSdp78419 
692ce8eb11aSdp78419 /*
693ce8eb11aSdp78419  *  Helper routine for debugging calls to mem_node_add_slice()
694ce8eb11aSdp78419  */
695ce8eb11aSdp78419 static	void
696ce8eb11aSdp78419 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
697ce8eb11aSdp78419 {
698ce8eb11aSdp78419 #if defined(DEBUG) && !defined(lint)
699ce8eb11aSdp78419 	static int slice_count = 0;
700ce8eb11aSdp78419 
701ce8eb11aSdp78419 	slice_count++;
702ce8eb11aSdp78419 	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
703ce8eb11aSdp78419 	    slice_count, basepfn, endpfn);
704ce8eb11aSdp78419 #endif
705ce8eb11aSdp78419 	mem_node_add_slice(basepfn, endpfn);
706ce8eb11aSdp78419 }
707ce8eb11aSdp78419 
708ce8eb11aSdp78419 /*
709ce8eb11aSdp78419  *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
710ce8eb11aSdp78419  */
711ce8eb11aSdp78419 static	void
712ce8eb11aSdp78419 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
713ce8eb11aSdp78419 {
714ce8eb11aSdp78419 	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
715ce8eb11aSdp78419 	    "mnode index: %d\n", plathand, mnode);
716ce8eb11aSdp78419 	plat_assign_lgrphand_to_mem_node(plathand, mnode);
717ce8eb11aSdp78419 }
718ce8eb11aSdp78419 
719ce8eb11aSdp78419 /*
720ce8eb11aSdp78419  * plat_build_mem_nodes()
721ce8eb11aSdp78419  *
722ce8eb11aSdp78419  * Define the mem_nodes based on the modified boot memory list,
723ce8eb11aSdp78419  * or based on info read from the MD in plat_lgrp_init().
724ce8eb11aSdp78419  *
725ce8eb11aSdp78419  * When the home mask lies in the middle of the address bits (as it does on
726ce8eb11aSdp78419  * Victoria Falls), then the memory in one mem_node is no longer contiguous;
727ce8eb11aSdp78419  * it is striped across an mblock in a repeating pattern of contiguous memory
728ce8eb11aSdp78419  * followed by a gap.  The stripe width is the size of the contiguous piece.
729ce8eb11aSdp78419  * The stride is the distance from the start of one contiguous piece to the
730ce8eb11aSdp78419  * start of the next.  The gap is thus stride - stripe_width.
731ce8eb11aSdp78419  *
732ce8eb11aSdp78419  * The stripe of an mnode that falls within an mblock is described by the type
733ce8eb11aSdp78419  * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
734ce8eb11aSdp78419  * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
735ce8eb11aSdp78419  * this array is predetermined.  The mem_stripe_t that describes mnode m
736ce8eb11aSdp78419  * within mpo_mblock[i] is stored at
737ce8eb11aSdp78419  *	 mem_stripes[ m + i * max_locality_groups ]
738ce8eb11aSdp78419  *
739ce8eb11aSdp78419  * max_locality_groups is the total number of possible locality groups,
740ce8eb11aSdp78419  * as defined by the size of the home mask, even if the memory assigned
741ce8eb11aSdp78419  * to the domain is small and does not cover all the lgroups.  Thus some
742ce8eb11aSdp78419  * mem_stripe_t's may be empty.
743ce8eb11aSdp78419  *
744ce8eb11aSdp78419  * The members of mem_stripe_t are:
745ce8eb11aSdp78419  *	physbase: First valid page in mem_node in the corresponding mblock
746ce8eb11aSdp78419  *	physmax: Last valid page in mem_node in mblock
747ce8eb11aSdp78419  *	offset:  The full stripe width starts at physbase - offset.
748ce8eb11aSdp78419  *	    Thus if offset is non-zero, this mem_node starts in the middle
749ce8eb11aSdp78419  *	    of a stripe width, and the second full stripe starts at
750ce8eb11aSdp78419  *	    physbase - offset + stride.  (even though physmax may fall in the
751ce8eb11aSdp78419  *	    middle of a stripe width, we do not save the ending fragment size
752ce8eb11aSdp78419  *	    in this data structure.)
753ce8eb11aSdp78419  *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
754ce8eb11aSdp78419  *
755ce8eb11aSdp78419  *	The stripe width is kept in the global mnode_pages.
756ce8eb11aSdp78419  *	The stride is kept in the global mnode_stride.
757ce8eb11aSdp78419  *	All the above use pfn's as the unit.
758ce8eb11aSdp78419  *
759ce8eb11aSdp78419  * As an example, the memory layout for a domain with 2 mblocks and 4
760ce8eb11aSdp78419  * mem_nodes 0,1,2,3 could look like this:
761ce8eb11aSdp78419  *
762ce8eb11aSdp78419  *	123012301230 ...	012301230123 ...
763ce8eb11aSdp78419  *	  mblock 0		  mblock 1
764ce8eb11aSdp78419  */
765ce8eb11aSdp78419 
766ce8eb11aSdp78419 void
767ce8eb11aSdp78419 plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
768ce8eb11aSdp78419 {
769ce8eb11aSdp78419 	lgrp_handle_t lgrphand, lgrp_start;
770ce8eb11aSdp78419 	int i, mnode, elem;
771ce8eb11aSdp78419 	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
772ce8eb11aSdp78419 	uint64_t stripe, frag, remove;
773ce8eb11aSdp78419 	mem_stripe_t *ms;
774ce8eb11aSdp78419 
775e853d8c3Sjc25722 	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
776e853d8c3Sjc25722 	max_mem_nodes = max_locality_groups;
777ce8eb11aSdp78419 
778e853d8c3Sjc25722 	/* Check for non-MPO sun4v platforms */
779ce8eb11aSdp78419 	if (n_locality_groups <= 1) {
780e853d8c3Sjc25722 		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
781ce8eb11aSdp78419 		for (elem = 0; elem < nelems; elem += 2) {
782ce8eb11aSdp78419 			base = list[elem];
783ce8eb11aSdp78419 			len = list[elem+1];
784ce8eb11aSdp78419 
785ce8eb11aSdp78419 			mpo_mem_node_add_slice(btop(base),
786ce8eb11aSdp78419 			    btop(base + len - 1));
787ce8eb11aSdp78419 		}
788ce8eb11aSdp78419 		mem_node_pfn_shift = 0;
789ce8eb11aSdp78419 		mem_node_physalign = 0;
790ce8eb11aSdp78419 		n_mem_stripes = 0;
791e853d8c3Sjc25722 		if (n_mblocks == 1)
792ce8eb11aSdp78419 			return;
793ce8eb11aSdp78419 	}
794ce8eb11aSdp78419 
795*bb57d1f5Sjc25722 	bzero(mem_stripes, mstripesz);
796ce8eb11aSdp78419 	stripe = ptob(mnode_pages);
797ce8eb11aSdp78419 	stride = max_locality_groups * stripe;
798ce8eb11aSdp78419 
799ce8eb11aSdp78419 	/* Save commonly used values in globals */
800ce8eb11aSdp78419 	mnode_stride = btop(stride);
801ce8eb11aSdp78419 	n_mem_stripes = max_locality_groups * n_mblocks;
802ce8eb11aSdp78419 	stripe_shift = highbit(max_locality_groups) - 1;
803ce8eb11aSdp78419 
804ce8eb11aSdp78419 	for (i = 0; i < n_mblocks; i++) {
805ce8eb11aSdp78419 
806ce8eb11aSdp78419 		base = mpo_mblock[i].base;
807ce8eb11aSdp78419 		end = mpo_mblock[i].base + mpo_mblock[i].size;
808ce8eb11aSdp78419 		ra_to_pa = mpo_mblock[i].ra_to_pa;
809ce8eb11aSdp78419 		mpo_mblock[i].base_pfn = btop(base);
810ce8eb11aSdp78419 		mpo_mblock[i].end_pfn = btop(end - 1);
811ce8eb11aSdp78419 
812ce8eb11aSdp78419 		/* Find the offset from the prev stripe boundary in PA space. */
813ce8eb11aSdp78419 		offset = (base + ra_to_pa) & (stripe - 1);
814ce8eb11aSdp78419 
815ce8eb11aSdp78419 		/* Set the next stripe boundary. */
816ce8eb11aSdp78419 		stripe_end = base - offset + stripe;
817ce8eb11aSdp78419 
818ce8eb11aSdp78419 		lgrp_start = (((base + ra_to_pa) & home_mask) >>
819ce8eb11aSdp78419 		    home_mask_shift);
820ce8eb11aSdp78419 		lgrphand = lgrp_start;
821ce8eb11aSdp78419 
822ce8eb11aSdp78419 		/*
823ce8eb11aSdp78419 		 * Loop over all lgroups covered by the mblock, creating a
824ce8eb11aSdp78419 		 * stripe for each.  Stop when lgrp_start is visited again.
825ce8eb11aSdp78419 		 */
826ce8eb11aSdp78419 		do {
827ce8eb11aSdp78419 			/* mblock may not span all lgroups */
828ce8eb11aSdp78419 			if (base >= end)
829ce8eb11aSdp78419 				break;
830ce8eb11aSdp78419 
831ce8eb11aSdp78419 			mnode = lgrphand;
832ce8eb11aSdp78419 			ASSERT(mnode < max_mem_nodes);
833*bb57d1f5Sjc25722 			mpo_mblock[i].mnode_mask |= (mnodeset_t)1 << mnode;
834ce8eb11aSdp78419 
835ce8eb11aSdp78419 			/*
836ce8eb11aSdp78419 			 * Calculate the size of the fragment that does not
837ce8eb11aSdp78419 			 * belong to the mnode in the last partial stride.
838ce8eb11aSdp78419 			 */
839ce8eb11aSdp78419 			frag = (end - (base - offset)) & (stride - 1);
840ce8eb11aSdp78419 			if (frag == 0) {
841ce8eb11aSdp78419 				/* remove the gap */
842ce8eb11aSdp78419 				remove = stride - stripe;
843ce8eb11aSdp78419 			} else if (frag < stripe) {
844ce8eb11aSdp78419 				/* fragment fits in stripe; keep it all */
845ce8eb11aSdp78419 				remove = 0;
846ce8eb11aSdp78419 			} else {
847ce8eb11aSdp78419 				/* fragment is large; trim after whole stripe */
848ce8eb11aSdp78419 				remove = frag - stripe;
849ce8eb11aSdp78419 			}
850ce8eb11aSdp78419 
851ce8eb11aSdp78419 			ms = &mem_stripes[i * max_locality_groups + mnode];
852ce8eb11aSdp78419 			ms->physbase = btop(base);
853ce8eb11aSdp78419 			ms->physmax = btop(end - 1 - remove);
854ce8eb11aSdp78419 			ms->offset = btop(offset);
855ce8eb11aSdp78419 			ms->exists = 1;
856ce8eb11aSdp78419 
857e853d8c3Sjc25722 			/*
858e853d8c3Sjc25722 			 * If we have only 1 lgroup and multiple mblocks,
859e853d8c3Sjc25722 			 * then we have already established our lgrp handle
860e853d8c3Sjc25722 			 * to mem_node and mem_node_config values above.
861e853d8c3Sjc25722 			 */
862e853d8c3Sjc25722 			if (n_locality_groups > 1) {
863e853d8c3Sjc25722 				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
864e853d8c3Sjc25722 				    mnode);
865e853d8c3Sjc25722 				mpo_mem_node_add_slice(ms->physbase,
866e853d8c3Sjc25722 				    ms->physmax);
867e853d8c3Sjc25722 			}
868ce8eb11aSdp78419 			base = stripe_end;
869ce8eb11aSdp78419 			stripe_end += stripe;
870ce8eb11aSdp78419 			offset = 0;
871ce8eb11aSdp78419 			lgrphand = (((base + ra_to_pa) & home_mask) >>
872ce8eb11aSdp78419 			    home_mask_shift);
873ce8eb11aSdp78419 		} while (lgrphand != lgrp_start);
874ce8eb11aSdp78419 	}
875ce8eb11aSdp78419 
876ce8eb11aSdp78419 	/*
877ce8eb11aSdp78419 	 * Indicate to vm_pagelist that the hpm_counters array
878ce8eb11aSdp78419 	 * should be shared because the ranges overlap.
879ce8eb11aSdp78419 	 */
880ce8eb11aSdp78419 	if (max_mem_nodes > 1) {
881ce8eb11aSdp78419 		interleaved_mnodes = 1;
882ce8eb11aSdp78419 	}
883ce8eb11aSdp78419 }
884ce8eb11aSdp78419 
885ce8eb11aSdp78419 /*
886ce8eb11aSdp78419  * Return the locality group value for the supplied processor
887ce8eb11aSdp78419  */
888ce8eb11aSdp78419 lgrp_handle_t
889ce8eb11aSdp78419 plat_lgrp_cpu_to_hand(processorid_t id)
890ce8eb11aSdp78419 {
891ce8eb11aSdp78419 	if (n_locality_groups > 1) {
892ce8eb11aSdp78419 		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
893ce8eb11aSdp78419 	} else {
894e853d8c3Sjc25722 		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
895ce8eb11aSdp78419 	}
896ce8eb11aSdp78419 }
897ce8eb11aSdp78419 
898ce8eb11aSdp78419 int
899ce8eb11aSdp78419 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
900ce8eb11aSdp78419 {
901ce8eb11aSdp78419 	/*
902ce8eb11aSdp78419 	 * Return min remote latency when there are more than two lgroups
903ce8eb11aSdp78419 	 * (root and child) and getting latency between two different lgroups
904ce8eb11aSdp78419 	 * or root is involved.
905ce8eb11aSdp78419 	 */
906ce8eb11aSdp78419 	if (lgrp_optimizations() && (from != to ||
907ce8eb11aSdp78419 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
908ce8eb11aSdp78419 		return ((int)higher_latency);
909ce8eb11aSdp78419 	} else {
910ce8eb11aSdp78419 		return ((int)lower_latency);
911ce8eb11aSdp78419 	}
912ce8eb11aSdp78419 }
913ce8eb11aSdp78419 
914ce8eb11aSdp78419 int
915ce8eb11aSdp78419 plat_pfn_to_mem_node(pfn_t pfn)
916ce8eb11aSdp78419 {
917ce8eb11aSdp78419 	int i, mnode;
918ce8eb11aSdp78419 	pfn_t ra_to_pa_pfn;
919ce8eb11aSdp78419 	struct mblock_md *mb;
920ce8eb11aSdp78419 
921ce8eb11aSdp78419 	if (n_locality_groups <= 1)
922ce8eb11aSdp78419 		return (0);
923ce8eb11aSdp78419 
924ce8eb11aSdp78419 	/*
925ce8eb11aSdp78419 	 * The mnode is defined to be 1:1 with the lgroup handle, which
926ce8eb11aSdp78419 	 * is taken from from the home bits.  Find the mblock in which
927ce8eb11aSdp78419 	 * the pfn falls to get the ra_to_pa adjustment, and extract
928ce8eb11aSdp78419 	 * the home bits.
929ce8eb11aSdp78419 	 */
930ce8eb11aSdp78419 	mb = &mpo_mblock[0];
931ce8eb11aSdp78419 	for (i = 0; i < n_mblocks; i++) {
932ce8eb11aSdp78419 		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
933ce8eb11aSdp78419 			ra_to_pa_pfn = btop(mb->ra_to_pa);
934ce8eb11aSdp78419 			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
935ce8eb11aSdp78419 			    home_mask_pfn_shift);
936ce8eb11aSdp78419 			ASSERT(mnode < max_mem_nodes);
937ce8eb11aSdp78419 			return (mnode);
938ce8eb11aSdp78419 		}
939ce8eb11aSdp78419 		mb++;
940ce8eb11aSdp78419 	}
941ce8eb11aSdp78419 
942ce8eb11aSdp78419 	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
943ce8eb11aSdp78419 	return (pfn);
944ce8eb11aSdp78419 }
945ce8eb11aSdp78419 
946ce8eb11aSdp78419 /*
947ce8eb11aSdp78419  * plat_rapfn_to_papfn
948ce8eb11aSdp78419  *
949ce8eb11aSdp78419  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
950ce8eb11aSdp78419  * and home mask bits are correct.  The upper bits do not necessarily
951ce8eb11aSdp78419  * match the actual PA, however.
952ce8eb11aSdp78419  */
953ce8eb11aSdp78419 pfn_t
954ce8eb11aSdp78419 plat_rapfn_to_papfn(pfn_t pfn)
955ce8eb11aSdp78419 {
956ce8eb11aSdp78419 	int i;
957ce8eb11aSdp78419 	pfn_t ra_to_pa_pfn;
958ce8eb11aSdp78419 	struct mblock_md *mb;
959ce8eb11aSdp78419 
960ce8eb11aSdp78419 	ASSERT(n_mblocks > 0);
961ce8eb11aSdp78419 	if (n_mblocks == 1)
962ce8eb11aSdp78419 		return (pfn + base_ra_to_pa_pfn);
963ce8eb11aSdp78419 
964ce8eb11aSdp78419 	/*
965ce8eb11aSdp78419 	 * Find the mblock in which the pfn falls
966ce8eb11aSdp78419 	 * in order to get the ra_to_pa adjustment.
967ce8eb11aSdp78419 	 */
968ce8eb11aSdp78419 	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
969ce8eb11aSdp78419 		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
970ce8eb11aSdp78419 			ra_to_pa_pfn = btop(mb->ra_to_pa);
971ce8eb11aSdp78419 			return (pfn + ra_to_pa_pfn);
972ce8eb11aSdp78419 		}
973ce8eb11aSdp78419 	}
974ce8eb11aSdp78419 
975ce8eb11aSdp78419 	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
976ce8eb11aSdp78419 	return (pfn);
977ce8eb11aSdp78419 }
978ce8eb11aSdp78419 
979ce8eb11aSdp78419 /*
980ce8eb11aSdp78419  * plat_mem_node_iterator_init()
981ce8eb11aSdp78419  *	Initialize cookie to iterate over pfn's in an mnode.  There is
982ce8eb11aSdp78419  *	no additional iterator function.  The caller uses the info from
983ce8eb11aSdp78419  *	the iterator structure directly.
984ce8eb11aSdp78419  *
985ce8eb11aSdp78419  *	pfn: starting pfn.
986ce8eb11aSdp78419  * 	mnode: desired mnode.
987ce8eb11aSdp78419  *	init: set to 1 for full init, 0 for continuation
988ce8eb11aSdp78419  *
989ce8eb11aSdp78419  *	Returns the appropriate starting pfn for the iteration
990ce8eb11aSdp78419  *	the same as the input pfn if it falls in an mblock.
991ce8eb11aSdp78419  *	Returns the (pfn_t)-1 value if the input pfn lies past
992ce8eb11aSdp78419  *	the last valid mnode pfn.
993ce8eb11aSdp78419  */
994ce8eb11aSdp78419 pfn_t
995ce8eb11aSdp78419 plat_mem_node_iterator_init(pfn_t pfn, int mnode,
996ce8eb11aSdp78419     mem_node_iterator_t *it, int init)
997ce8eb11aSdp78419 {
998ce8eb11aSdp78419 	int i;
999ce8eb11aSdp78419 	struct mblock_md *mblock;
1000ce8eb11aSdp78419 	pfn_t base, end;
1001ce8eb11aSdp78419 
1002ce8eb11aSdp78419 	ASSERT(it != NULL);
1003ce8eb11aSdp78419 	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1004ce8eb11aSdp78419 	ASSERT(n_mblocks > 0);
1005ce8eb11aSdp78419 
1006ce8eb11aSdp78419 	if (init) {
1007ce8eb11aSdp78419 		it->mi_last_mblock = 0;
1008ce8eb11aSdp78419 		it->mi_init = 1;
1009ce8eb11aSdp78419 	}
1010ce8eb11aSdp78419 
1011ce8eb11aSdp78419 	/* Check if mpo is not enabled and we only have one mblock */
1012ce8eb11aSdp78419 	if (n_locality_groups == 1 && n_mblocks == 1) {
1013ce8eb11aSdp78419 		it->mi_mnode = mnode;
1014ce8eb11aSdp78419 		it->mi_ra_to_pa = base_ra_to_pa_pfn;
1015ce8eb11aSdp78419 		it->mi_mnode_pfn_mask = 0;
1016ce8eb11aSdp78419 		it->mi_mnode_pfn_shift = 0;
1017ce8eb11aSdp78419 		it->mi_mnode_mask = 0;
1018ce8eb11aSdp78419 		it->mi_mblock_base = mem_node_config[mnode].physbase;
1019ce8eb11aSdp78419 		it->mi_mblock_end = mem_node_config[mnode].physmax;
1020ce8eb11aSdp78419 		if (pfn < it->mi_mblock_base)
1021ce8eb11aSdp78419 			pfn = it->mi_mblock_base;
1022ce8eb11aSdp78419 		else if (pfn > it->mi_mblock_end)
1023ce8eb11aSdp78419 			pfn = (pfn_t)-1;
1024ce8eb11aSdp78419 		return (pfn);
1025ce8eb11aSdp78419 	}
1026ce8eb11aSdp78419 
1027ce8eb11aSdp78419 	/*
1028ce8eb11aSdp78419 	 * Find mblock that contains pfn, or first mblock after pfn,
1029ce8eb11aSdp78419 	 * else pfn is out of bounds, so use the last mblock.
1030ce8eb11aSdp78419 	 * mblocks are sorted in ascending address order.
1031ce8eb11aSdp78419 	 */
1032ce8eb11aSdp78419 	ASSERT(it->mi_last_mblock < n_mblocks);
1033ce8eb11aSdp78419 	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
1034ce8eb11aSdp78419 	i = init ? 0 : it->mi_last_mblock + 1;
1035ce8eb11aSdp78419 	if (i == n_mblocks)
1036ce8eb11aSdp78419 		return ((pfn_t)-1);
1037ce8eb11aSdp78419 
1038ce8eb11aSdp78419 	for (; i < n_mblocks; i++) {
1039*bb57d1f5Sjc25722 		if ((mpo_mblock[i].mnode_mask & ((mnodeset_t)1 << mnode)) &&
1040*bb57d1f5Sjc25722 		    (pfn <= mpo_mblock[i].end_pfn))
1041ce8eb11aSdp78419 			break;
1042ce8eb11aSdp78419 	}
1043ce8eb11aSdp78419 	if (i == n_mblocks) {
1044ce8eb11aSdp78419 		it->mi_last_mblock = i - 1;
1045ce8eb11aSdp78419 		return ((pfn_t)-1);
1046ce8eb11aSdp78419 	}
1047ce8eb11aSdp78419 	it->mi_last_mblock = i;
1048ce8eb11aSdp78419 
1049ce8eb11aSdp78419 	/*
1050ce8eb11aSdp78419 	 * Memory stripes are defined if there is more than one locality
1051ce8eb11aSdp78419 	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
1052ce8eb11aSdp78419 	 */
1053ce8eb11aSdp78419 	mblock = &mpo_mblock[i];
1054ce8eb11aSdp78419 	if (n_mem_stripes > 0) {
1055ce8eb11aSdp78419 		mem_stripe_t *ms =
1056ce8eb11aSdp78419 		    &mem_stripes[i * max_locality_groups + mnode];
1057ce8eb11aSdp78419 		base = ms->physbase;
1058ce8eb11aSdp78419 		end = ms->physmax;
1059ce8eb11aSdp78419 	} else {
1060ce8eb11aSdp78419 		ASSERT(mnode == 0);
1061ce8eb11aSdp78419 		base = mblock->base_pfn;
1062ce8eb11aSdp78419 		end = mblock->end_pfn;
1063ce8eb11aSdp78419 	}
1064ce8eb11aSdp78419 
1065ce8eb11aSdp78419 	it->mi_mnode = mnode;
1066ce8eb11aSdp78419 	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1067ce8eb11aSdp78419 	it->mi_mblock_base = base;
1068ce8eb11aSdp78419 	it->mi_mblock_end = end;
1069ce8eb11aSdp78419 	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
1070ce8eb11aSdp78419 	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1071ce8eb11aSdp78419 	it->mi_mnode_mask = max_locality_groups - 1;
1072ce8eb11aSdp78419 	if (pfn < base)
1073ce8eb11aSdp78419 		pfn = base;
1074ce8eb11aSdp78419 	else if (pfn > end)
1075ce8eb11aSdp78419 		pfn = (pfn_t)-1;
1076ce8eb11aSdp78419 	return (pfn);
1077ce8eb11aSdp78419 }
1078ce8eb11aSdp78419 
1079ce8eb11aSdp78419 /*
1080ce8eb11aSdp78419  * plat_mem_node_intersect_range()
1081ce8eb11aSdp78419  *
1082ce8eb11aSdp78419  * Find the intersection between a memnode and a range of pfn's.
1083ce8eb11aSdp78419  */
1084ce8eb11aSdp78419 void
1085ce8eb11aSdp78419 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1086ce8eb11aSdp78419     int mnode, pgcnt_t *npages_out)
1087ce8eb11aSdp78419 {
1088ce8eb11aSdp78419 	pfn_t offset, len, hole, base, end, test_end, frag;
1089ce8eb11aSdp78419 	pfn_t nearest;
1090ce8eb11aSdp78419 	mem_stripe_t *ms;
1091ce8eb11aSdp78419 	int i, npages;
1092ce8eb11aSdp78419 
1093ce8eb11aSdp78419 	*npages_out = 0;
1094ce8eb11aSdp78419 
1095ce8eb11aSdp78419 	if (!mem_node_config[mnode].exists || test_len == 0)
1096ce8eb11aSdp78419 		return;
1097ce8eb11aSdp78419 
1098ce8eb11aSdp78419 	base = mem_node_config[mnode].physbase;
1099ce8eb11aSdp78419 	end = mem_node_config[mnode].physmax;
1100ce8eb11aSdp78419 
1101ce8eb11aSdp78419 	test_end = test_base + test_len - 1;
1102ce8eb11aSdp78419 	if (end < test_base || base > test_end)
1103ce8eb11aSdp78419 		return;
1104ce8eb11aSdp78419 
1105ce8eb11aSdp78419 	if (n_locality_groups == 1) {
1106ce8eb11aSdp78419 		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1107ce8eb11aSdp78419 		return;
1108ce8eb11aSdp78419 	}
1109ce8eb11aSdp78419 
1110ce8eb11aSdp78419 	hole = mnode_stride - mnode_pages;
1111ce8eb11aSdp78419 	npages = 0;
1112ce8eb11aSdp78419 
1113ce8eb11aSdp78419 	/*
1114ce8eb11aSdp78419 	 * Iterate over all the stripes for this mnode (one per mblock),
1115ce8eb11aSdp78419 	 * find the intersection with each, and accumulate the intersections.
1116ce8eb11aSdp78419 	 *
1117ce8eb11aSdp78419 	 * Determing the intersection with a stripe is tricky.  If base or end
1118ce8eb11aSdp78419 	 * fall outside the mem_node bounds, round them to physbase/physmax of
1119ce8eb11aSdp78419 	 * mem_node.  If base or end fall in a gap, round them to start of
1120ce8eb11aSdp78419 	 * nearest stripe.  If they fall within a stripe, keep base or end,
1121ce8eb11aSdp78419 	 * but calculate the fragment size that should be excluded from the
1122ce8eb11aSdp78419 	 * stripe.  Calculate how many strides fall in the adjusted range,
1123ce8eb11aSdp78419 	 * multiply by stripe width, and add the start and end fragments.
1124ce8eb11aSdp78419 	 */
1125ce8eb11aSdp78419 
1126ce8eb11aSdp78419 	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1127ce8eb11aSdp78419 		ms = &mem_stripes[i];
1128ce8eb11aSdp78419 		if (ms->exists &&
1129ce8eb11aSdp78419 		    test_base <= (end = ms->physmax) &&
1130ce8eb11aSdp78419 		    test_end >= (base = ms->physbase)) {
1131ce8eb11aSdp78419 
1132ce8eb11aSdp78419 			offset = ms->offset;
1133ce8eb11aSdp78419 
1134ce8eb11aSdp78419 			if (test_base > base) {
1135ce8eb11aSdp78419 				/* Round test_base to next multiple of stride */
1136ce8eb11aSdp78419 				len = P2ROUNDUP(test_base - (base - offset),
1137ce8eb11aSdp78419 				    mnode_stride);
1138ce8eb11aSdp78419 				nearest = base - offset + len;
1139ce8eb11aSdp78419 				/*
1140ce8eb11aSdp78419 				 * Compute distance from test_base to the
1141ce8eb11aSdp78419 				 * stride boundary to see if test_base falls
1142ce8eb11aSdp78419 				 * in the stripe or in the hole.
1143ce8eb11aSdp78419 				 */
1144ce8eb11aSdp78419 				if (nearest - test_base > hole) {
1145ce8eb11aSdp78419 					/*
1146ce8eb11aSdp78419 					 * test_base lies in stripe,
1147ce8eb11aSdp78419 					 * and offset should be excluded.
1148ce8eb11aSdp78419 					 */
1149ce8eb11aSdp78419 					offset = test_base -
1150ce8eb11aSdp78419 					    (nearest - mnode_stride);
1151ce8eb11aSdp78419 					base = test_base;
1152ce8eb11aSdp78419 				} else {
1153ce8eb11aSdp78419 					/* round up to next stripe start */
1154ce8eb11aSdp78419 					offset = 0;
1155ce8eb11aSdp78419 					base = nearest;
1156ce8eb11aSdp78419 					if (base > end)
1157ce8eb11aSdp78419 						continue;
1158ce8eb11aSdp78419 				}
1159ce8eb11aSdp78419 
1160ce8eb11aSdp78419 			}
1161ce8eb11aSdp78419 
1162ce8eb11aSdp78419 			if (test_end < end)
1163ce8eb11aSdp78419 				end = test_end;
1164ce8eb11aSdp78419 			end++;		/* adjust to an exclusive bound */
1165ce8eb11aSdp78419 
1166ce8eb11aSdp78419 			/* Round end to next multiple of stride */
1167ce8eb11aSdp78419 			len = P2ROUNDUP(end - (base - offset), mnode_stride);
1168ce8eb11aSdp78419 			nearest = (base - offset) + len;
1169ce8eb11aSdp78419 			if (nearest - end <= hole) {
1170ce8eb11aSdp78419 				/* end falls in hole, use entire last stripe */
1171ce8eb11aSdp78419 				frag = 0;
1172ce8eb11aSdp78419 			} else {
1173ce8eb11aSdp78419 				/* end falls in stripe, compute fragment */
1174ce8eb11aSdp78419 				frag = nearest - hole - end;
1175ce8eb11aSdp78419 			}
1176ce8eb11aSdp78419 
1177ce8eb11aSdp78419 			len = (len >> stripe_shift) - offset - frag;
1178ce8eb11aSdp78419 			npages += len;
1179ce8eb11aSdp78419 		}
1180ce8eb11aSdp78419 	}
1181ce8eb11aSdp78419 
1182ce8eb11aSdp78419 	*npages_out = npages;
1183ce8eb11aSdp78419 }
1184ce8eb11aSdp78419 
1185ce8eb11aSdp78419 /*
1186ce8eb11aSdp78419  * valid_pages()
1187ce8eb11aSdp78419  *
1188ce8eb11aSdp78419  * Return 1 if pages are valid and do not cross mnode boundaries
1189ce8eb11aSdp78419  * (which would break page free list assumptions), and 0 otherwise.
1190ce8eb11aSdp78419  */
1191ce8eb11aSdp78419 
1192ce8eb11aSdp78419 #define	MNODE(pa)	\
1193ce8eb11aSdp78419 	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1194ce8eb11aSdp78419 
1195ce8eb11aSdp78419 static int
1196ce8eb11aSdp78419 valid_pages(md_t *md, mde_cookie_t cpu0)
1197ce8eb11aSdp78419 {
1198ce8eb11aSdp78419 	int i, max_szc;
1199ce8eb11aSdp78419 	uint64_t last_page_base, szc_mask;
1200ce8eb11aSdp78419 	uint64_t max_page_len, max_coalesce_len;
1201ce8eb11aSdp78419 	struct mblock_md *mb = mpo_mblock;
1202ce8eb11aSdp78419 
1203ce8eb11aSdp78419 	/*
1204ce8eb11aSdp78419 	 * Find the smaller of the largest page possible and supported.
1205ce8eb11aSdp78419 	 * mmu_exported_pagesize_mask is not yet initialized, so read
1206ce8eb11aSdp78419 	 * it from the MD.  Apply minimal fixups in case of broken MDs
1207ce8eb11aSdp78419 	 * to get a sane mask.
1208ce8eb11aSdp78419 	 */
1209ce8eb11aSdp78419 
1210ce8eb11aSdp78419 	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1211ce8eb11aSdp78419 		szc_mask = 0;
1212ce8eb11aSdp78419 	szc_mask |=  (1 << TTE4M);	/* largest in sun4v default support */
1213ce8eb11aSdp78419 	max_szc = highbit(szc_mask) - 1;
1214ce8eb11aSdp78419 	if (max_szc > TTE256M)
1215ce8eb11aSdp78419 		max_szc = TTE256M;
1216ce8eb11aSdp78419 	max_page_len = TTEBYTES(max_szc);
1217ce8eb11aSdp78419 
1218ce8eb11aSdp78419 	/*
1219ce8eb11aSdp78419 	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1220ce8eb11aSdp78419 	 * if mmu-page-size-list does not contain it, so 256M pages must fall
1221ce8eb11aSdp78419 	 * within one mnode to use MPO.
1222ce8eb11aSdp78419 	 */
1223ce8eb11aSdp78419 	max_coalesce_len = TTEBYTES(TTE256M);
1224ce8eb11aSdp78419 	ASSERT(max_coalesce_len >= max_page_len);
1225ce8eb11aSdp78419 
1226ce8eb11aSdp78419 	if (ptob(mnode_pages) < max_coalesce_len) {
1227ce8eb11aSdp78419 		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1228ce8eb11aSdp78419 		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1229ce8eb11aSdp78419 		return (0);
1230ce8eb11aSdp78419 	}
1231ce8eb11aSdp78419 
1232ce8eb11aSdp78419 	for (i = 0; i < n_mblocks; i++) {
1233ce8eb11aSdp78419 		uint64_t base = mb->base;
1234ce8eb11aSdp78419 		uint64_t end = mb->base + mb->size - 1;
1235ce8eb11aSdp78419 		uint64_t ra_to_pa = mb->ra_to_pa;
1236ce8eb11aSdp78419 
1237ce8eb11aSdp78419 		/*
1238ce8eb11aSdp78419 		 * If mblock is smaller than the max page size, then
1239ce8eb11aSdp78419 		 * RA = PA mod MAXPAGE is not guaranteed, but it must
1240ce8eb11aSdp78419 		 * not span mnodes.
1241ce8eb11aSdp78419 		 */
1242ce8eb11aSdp78419 		if (mb->size < max_page_len) {
1243ce8eb11aSdp78419 			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1244ce8eb11aSdp78419 				MPO_STATUS("Small mblock spans mnodes; "
1245ce8eb11aSdp78419 				    "MPO disabled: base = %lx, end = %lx, "
1246ce8eb11aSdp78419 				    "ra2pa = %lx\n", base, end, ra_to_pa);
1247ce8eb11aSdp78419 				return (0);
1248ce8eb11aSdp78419 			}
1249ce8eb11aSdp78419 		} else {
1250ce8eb11aSdp78419 			/* Verify RA = PA mod MAXPAGE, using coalesce size */
1251ce8eb11aSdp78419 			uint64_t pa_base = base + ra_to_pa;
1252ce8eb11aSdp78419 			if ((base & (max_coalesce_len - 1)) !=
1253ce8eb11aSdp78419 			    (pa_base & (max_coalesce_len - 1))) {
1254ce8eb11aSdp78419 				MPO_STATUS("bad page alignment; MPO disabled: "
1255ce8eb11aSdp78419 				    "ra = %lx, pa = %lx, pagelen = %lx\n",
1256ce8eb11aSdp78419 				    base, pa_base, max_coalesce_len);
1257ce8eb11aSdp78419 				return (0);
1258ce8eb11aSdp78419 			}
1259ce8eb11aSdp78419 		}
1260ce8eb11aSdp78419 
1261ce8eb11aSdp78419 		/*
1262ce8eb11aSdp78419 		 * Find start of last large page in mblock in RA space.
1263ce8eb11aSdp78419 		 * If page extends into the next mblock, verify the
1264ce8eb11aSdp78419 		 * mnode does not change.
1265ce8eb11aSdp78419 		 */
1266ce8eb11aSdp78419 		last_page_base = P2ALIGN(end, max_coalesce_len);
1267ce8eb11aSdp78419 		if (i + 1 < n_mblocks &&
1268ce8eb11aSdp78419 		    last_page_base + max_coalesce_len > mb[1].base &&
1269ce8eb11aSdp78419 		    MNODE(last_page_base + ra_to_pa) !=
1270ce8eb11aSdp78419 		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
1271ce8eb11aSdp78419 			MPO_STATUS("Large page spans mblocks; MPO disabled: "
1272ce8eb11aSdp78419 			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1273ce8eb11aSdp78419 			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1274ce8eb11aSdp78419 			    mb[1].ra_to_pa, max_coalesce_len);
1275ce8eb11aSdp78419 			return (0);
1276ce8eb11aSdp78419 		}
1277ce8eb11aSdp78419 
1278ce8eb11aSdp78419 		mb++;
1279ce8eb11aSdp78419 	}
1280ce8eb11aSdp78419 	return (1);
1281ce8eb11aSdp78419 }
1282ce8eb11aSdp78419 
1283ce8eb11aSdp78419 
1284ce8eb11aSdp78419 /*
1285ce8eb11aSdp78419  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1286ce8eb11aSdp78419  * if any, and remove them.  This yields a config where the "coarse
1287ce8eb11aSdp78419  * grained" lgroups cover all of memory, even though part of that memory
1288ce8eb11aSdp78419  * is fine grain interleaved and does not deliver a purely local memory
1289ce8eb11aSdp78419  * latency.
1290ce8eb11aSdp78419  *
1291ce8eb11aSdp78419  * This function reads and modifies the globals:
1292ce8eb11aSdp78419  *	mpo_lgroup[], n_lgrpnodes
1293ce8eb11aSdp78419  *
1294ce8eb11aSdp78419  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1295ce8eb11aSdp78419  */
1296ce8eb11aSdp78419 
1297ce8eb11aSdp78419 static int
1298ce8eb11aSdp78419 fix_interleave(void)
1299ce8eb11aSdp78419 {
1300ce8eb11aSdp78419 	int i, j;
1301ce8eb11aSdp78419 	uint64_t mask = 0;
1302ce8eb11aSdp78419 
1303ce8eb11aSdp78419 	j = 0;
1304ce8eb11aSdp78419 	for (i = 0; i < n_lgrpnodes; i++) {
1305ce8eb11aSdp78419 		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1306ce8eb11aSdp78419 			/* remove this lgroup */
1307ce8eb11aSdp78419 			mask = mpo_lgroup[i].addr_mask;
1308ce8eb11aSdp78419 		} else {
1309ce8eb11aSdp78419 			mpo_lgroup[j++] = mpo_lgroup[i];
1310ce8eb11aSdp78419 		}
1311ce8eb11aSdp78419 	}
1312ce8eb11aSdp78419 	n_lgrpnodes = j;
1313ce8eb11aSdp78419 
1314ce8eb11aSdp78419 	if (mask != 0)
1315ce8eb11aSdp78419 		MPO_STATUS("sub-page interleave %lx found; "
1316ce8eb11aSdp78419 		    "removing lgroup.\n", mask);
1317ce8eb11aSdp78419 
1318ce8eb11aSdp78419 	return (mask != 0);
1319ce8eb11aSdp78419 }
1320