xref: /titanic_50/usr/src/uts/sun4v/os/mpo.c (revision 3c4993fb5a74112f361d71dab20997bdc749a7fb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 #include <sys/types.h>
30 #include <sys/sysmacros.h>
31 #include <sys/machsystm.h>
32 #include <sys/machparam.h>
33 #include <sys/cmn_err.h>
34 #include <sys/stat.h>
35 #include <sys/mach_descrip.h>
36 #include <sys/memnode.h>
37 #include <sys/mdesc.h>
38 #include <sys/mpo.h>
39 #include <vm/vm_dep.h>
40 #include <vm/hat_sfmmu.h>
41 #include <sys/promif.h>
42 
43 /*
44  * MPO and the sun4v memory representation
45  * ---------------------------------------
46  *
47  * Latency groups are defined in the sun4v architecture by memory-latency-group
48  * nodes in the Machine Description, as specified in FWARC/2007/260.  These
49  * tie together cpu nodes and mblock nodes, and contain mask and match
50  * properties that identify the portion of an mblock that belongs to the
51  * lgroup.  Mask and match are defined in the Physical Address (PA) space,
52  * but an mblock defines Real Addresses (RA).  To translate, the mblock
53  * includes the property address-congruence-offset, hereafter referred to as
54  * ra_to_pa.  A real address ra is a member of an lgroup if
55  *
56  *	(ra + mblock.ra_to_pa) & lgroup.mask == lgroup.match
57  *
58  * The MD is traversed, and information on all mblocks is kept in the array
59  * mpo_mblock[].  Information on all CPUs, including which lgroup they map
60  * to, is kept in the array mpo_cpu[].
61  *
62  * This implementation makes (and verifies) the simplifying assumption that
63  * the mask bits are the same for all defined lgroups, and that all 1 bits in
64  * the mask are contiguous.  Thus the number of lgroups is bounded by the
65  * number of possible mask values, and the lgrp_handle_t is defined as the
66  * mask value, shifted right to eliminate the 0 bit positions in mask.  The
67  * masks and values are also referred to as "home bits" in the code.
68  *
69  * A mem_node is defined to be 1:1 with an lgrp_handle_t, thus each lgroup
70  * has exactly 1 mem_node, and plat_pfn_to_mem_node() must find the mblock
71  * containing a pfn, apply the mblock's ra_to_pa adjustment, and extract the
72  * home bits.  This yields the mem_node.
73  *
74  * Interfaces
75  * ----------
76  *
77  * This file exports the following entry points:
78  *
79  * plat_lgrp_init()
80  * plat_build_mem_nodes()
81  * plat_lgrp_cpu_to_hand()
82  * plat_lgrp_latency()
83  * plat_pfn_to_mem_node()
84  *	These implement the usual platform lgroup interfaces.
85  *
86  * plat_rapfn_to_papfn()
87  *	Recover the PA page coloring bits from an RA.
88  *
89  * plat_mem_node_iterator_init()
90  *	Initialize an iterator to efficiently step through pages in a mem_node.
91  *
92  * plat_mem_node_intersect_range()
93  *	Find the intersection with a mem_node.
94  */
95 
96 int	sun4v_mpo_enable = 1;
97 int	sun4v_mpo_debug = 0;
98 char	sun4v_mpo_status[256] = "";
99 
100 /* Save CPU info from the MD and associate CPUs with lgroups */
101 static	struct cpu_md mpo_cpu[NCPU];
102 
103 /* Save lgroup info from the MD */
104 #define	MAX_MD_LGROUPS 32
105 static	struct	lgrp_md mpo_lgroup[MAX_MD_LGROUPS];
106 static	int	n_lgrpnodes = 0;
107 static	int	n_locality_groups = 0;
108 static	int	max_locality_groups = 0;
109 
110 /* Save mblocks from the MD */
111 #define	SMALL_MBLOCKS_COUNT	8
112 static 	struct	mblock_md *mpo_mblock;
113 static	struct 	mblock_md small_mpo_mblocks[SMALL_MBLOCKS_COUNT];
114 static	int	n_mblocks = 0;
115 
116 /* Save mem_node stripes calculate from mblocks and lgroups. */
117 static mem_stripe_t *mem_stripes;
118 static	mem_stripe_t small_mem_stripes[SMALL_MBLOCKS_COUNT * MAX_MEM_NODES];
119 static	int 	mstripesz = 0;
120 static	int	n_mem_stripes = 0;
121 static	pfn_t	mnode_stride;	/* distance between stripes, start to start */
122 static	int	stripe_shift;	/* stride/stripes expressed as a shift */
123 static	pfn_t	mnode_pages;	/* mem_node stripe width */
124 
125 /* Save home mask and shift used to calculate lgrp_handle_t values */
126 static	uint64_t home_mask = 0;
127 static	pfn_t	home_mask_pfn = 0;
128 static	int	home_mask_shift = 0;
129 static	uint_t	home_mask_pfn_shift = 0;
130 
131 /* Save lowest and highest latencies found across all lgroups */
132 static	int	lower_latency = 0;
133 static	int	higher_latency = 0;
134 
135 static	pfn_t	base_ra_to_pa_pfn = 0;	/* ra_to_pa for single mblock memory */
136 
137 static	int	valid_pages(md_t *md, mde_cookie_t cpu0);
138 static	int	unique_home_mem_lg_count(uint64_t mem_lg_homeset);
139 static	int	fix_interleave(void);
140 
/*
 * Debug support.
 *
 * MPO_DEBUG() prints its printf-style arguments only on DEBUG (non-lint)
 * builds and only when sun4v_mpo_debug is non-zero; otherwise it expands
 * to nothing.  It is wrapped in do { } while (0) so that it behaves as a
 * single statement, e.g. as the body of an if that has an else.
 */
#if defined(DEBUG) && !defined(lint)
#define	MPO_DEBUG(...)							\
	do {								\
		if (sun4v_mpo_debug)					\
			printf(__VA_ARGS__);				\
	} while (0)
#else
#define	MPO_DEBUG(...)
#endif	/* DEBUG */

/*
 * Record status message, viewable from mdb.
 *
 * The formatted text is stored in sun4v_mpo_status and echoed through
 * MPO_DEBUG().  The echo uses an explicit "%s" so that a '%' in the
 * already-formatted message is never interpreted as a conversion.
 */
#define	MPO_STATUS(...)							\
	do {								\
		(void) snprintf(sun4v_mpo_status,			\
		    sizeof (sun4v_mpo_status), __VA_ARGS__);		\
		MPO_DEBUG("%s", sun4v_mpo_status);			\
	} while (0)
153 
154 /*
155  * Routine to read a uint64_t from a given md
156  */
157 static	int64_t
158 get_int(md_t md, mde_cookie_t node, char *propname, uint64_t *val)
159 {
160 	int err = md_get_prop_val(md, node, propname, val);
161 	return (err);
162 }
163 
164 static int
165 mblock_cmp(const void *a, const void *b)
166 {
167 	struct mblock_md *m1 = (struct mblock_md *)a;
168 	struct mblock_md *m2 = (struct mblock_md *)b;
169 
170 	if (m1->base < m2->base)
171 		return (-1);
172 	else if (m1->base == m2->base)
173 		return (0);
174 	else
175 		return (1);
176 }
177 
178 static void
179 mblock_sort(struct mblock_md *mblocks, int n)
180 {
181 	extern void qsort(void *, size_t, size_t,
182 	    int (*)(const void *, const void *));
183 
184 	qsort(mblocks, n, sizeof (mblocks[0]), mblock_cmp);
185 }
186 
187 /*
188  *
189  * Traverse the MD to determine:
190  *
191  *  Number of CPU nodes, lgrp_nodes, and mblocks
192  *  Then for each lgrp_node, obtain the appropriate data.
193  *  For each CPU, determine its home locality and store it.
194  *  For each mblock, retrieve its data and store it.
195  */
196 static	int
197 lgrp_traverse(md_t *md)
198 {
199 	mde_cookie_t root, *cpunodes, *lgrpnodes, *nodes, *mblocknodes;
200 	uint64_t i, j, k, o, n_nodes;
201 	uint64_t n_lgroups = 0;
202 	uint64_t mem_lg_homeset = 0;
203 	int ret_val = 0;
204 	int result = 0;
205 	int n_cpunodes = 0;
206 	int sub_page_fix;
207 	int mblocksz = 0;
208 	size_t allocsz;
209 
210 	n_nodes = md_node_count(md);
211 
212 	if (n_nodes <= 0) {
213 		MPO_STATUS("lgrp_traverse: No nodes in node count\n");
214 		ret_val = -1;
215 		goto fail;
216 	}
217 
218 	root = md_root_node(md);
219 
220 	if (root == MDE_INVAL_ELEM_COOKIE) {
221 		MPO_STATUS("lgrp_traverse: Root node is missing\n");
222 		ret_val = -1;
223 		goto fail;
224 	}
225 
226 	/*
227 	 * Build the Memory Nodes.  Do this before any possibility of
228 	 * bailing from this routine so we obtain ra_to_pa (needed for page
229 	 * coloring) even when there are no lgroups defined.
230 	 */
231 
232 	n_mblocks = md_alloc_scan_dag(md, root, PROP_LG_MBLOCK,
233 	    "fwd", &mblocknodes);
234 
235 	if (n_mblocks <= 0) {
236 		MPO_STATUS("lgrp_traverse: No mblock "
237 		    "nodes detected in Machine Descriptor\n");
238 		n_mblocks = 0;
239 		ret_val = -1;
240 		goto fail;
241 	}
242 	/*
243 	 * If we have a small number of mblocks we will use the space
244 	 * that we preallocated. Otherwise, we will dynamically
245 	 * allocate the space
246 	 */
247 	mblocksz = n_mblocks * sizeof (struct mblock_md);
248 	mstripesz = MAX_MEM_NODES * n_mblocks * sizeof (mem_stripe_t);
249 
250 	if (n_mblocks <= SMALL_MBLOCKS_COUNT) {
251 		mpo_mblock = &small_mpo_mblocks[0];
252 		mem_stripes = &small_mem_stripes[0];
253 	} else {
254 		allocsz = mmu_ptob(mmu_btopr(mblocksz + mstripesz));
255 		/* Ensure that we dont request more space than reserved */
256 		if (allocsz > MPOBUF_SIZE) {
257 			MPO_STATUS("lgrp_traverse: Insufficient space "
258 			    "for mblock structures \n");
259 			ret_val = -1;
260 			n_mblocks = 0;
261 			goto fail;
262 		}
263 		mpo_mblock = (struct mblock_md *)
264 		    prom_alloc((caddr_t)MPOBUF_BASE, allocsz, PAGESIZE);
265 		if (mpo_mblock != (struct mblock_md *)MPOBUF_BASE) {
266 			MPO_STATUS("lgrp_traverse: Cannot allocate space "
267 			    "for mblocks \n");
268 			ret_val = -1;
269 			n_mblocks = 0;
270 			goto fail;
271 		}
272 		mpo_heap32_buf = (caddr_t)MPOBUF_BASE;
273 		mpo_heap32_bufsz = MPOBUF_SIZE;
274 
275 		mem_stripes = (mem_stripe_t *)(mpo_mblock + n_mblocks);
276 	}
277 
278 	for (i = 0; i < n_mblocks; i++) {
279 		mpo_mblock[i].node = mblocknodes[i];
280 		mpo_mblock[i].mnode_mask = (mnodeset_t)0;
281 
282 		/* Without a base or size value we will fail */
283 		result = get_int(md, mblocknodes[i], PROP_LG_BASE,
284 		    &mpo_mblock[i].base);
285 		if (result < 0) {
286 			MPO_STATUS("lgrp_traverse: "
287 			    "PROP_LG_BASE is missing\n");
288 			n_mblocks = 0;
289 			ret_val = -1;
290 			goto fail;
291 		}
292 
293 		result = get_int(md, mblocknodes[i], PROP_LG_SIZE,
294 		    &mpo_mblock[i].size);
295 		if (result < 0) {
296 			MPO_STATUS("lgrp_traverse: "
297 			    "PROP_LG_SIZE is missing\n");
298 			n_mblocks = 0;
299 			ret_val = -1;
300 			goto fail;
301 		}
302 
303 		result = get_int(md, mblocknodes[i],
304 		    PROP_LG_RA_PA_OFFSET, &mpo_mblock[i].ra_to_pa);
305 
306 		/* If we don't have an ra_pa_offset, just set it to 0 */
307 		if (result < 0)
308 			mpo_mblock[i].ra_to_pa = 0;
309 
310 		MPO_DEBUG("mblock[%ld]: base = %lx, size = %lx, "
311 		    "ra_to_pa = %lx\n", i,
312 		    mpo_mblock[i].base,
313 		    mpo_mblock[i].size,
314 		    mpo_mblock[i].ra_to_pa);
315 	}
316 
317 	/* Must sort mblocks by address for mem_node_iterator_init() */
318 	mblock_sort(mpo_mblock, n_mblocks);
319 
320 	base_ra_to_pa_pfn = btop(mpo_mblock[0].ra_to_pa);
321 
322 	/* Page coloring hook is required so we can iterate through mnodes */
323 	if (&page_next_pfn_for_color_cpu == NULL) {
324 		MPO_STATUS("lgrp_traverse: No page coloring support\n");
325 		ret_val = -1;
326 		goto fail;
327 	}
328 
329 	/* Global enable for mpo */
330 	if (sun4v_mpo_enable == 0) {
331 		MPO_STATUS("lgrp_traverse: MPO feature is not enabled\n");
332 		ret_val = -1;
333 		goto fail;
334 	}
335 
336 	n_lgrpnodes = md_alloc_scan_dag(md, root, PROP_LG_MEM_LG,
337 	    "fwd", &lgrpnodes);
338 
339 	if (n_lgrpnodes <= 0 || n_lgrpnodes >= MAX_MD_LGROUPS) {
340 		MPO_STATUS("lgrp_traverse: No Lgroups\n");
341 		ret_val = -1;
342 		goto fail;
343 	}
344 
345 	n_cpunodes = md_alloc_scan_dag(md, root, PROP_LG_CPU, "fwd", &cpunodes);
346 
347 	if (n_cpunodes <= 0 || n_cpunodes > NCPU) {
348 		MPO_STATUS("lgrp_traverse: No CPU nodes detected "
349 		    "in MD\n");
350 		ret_val = -1;
351 		goto fail;
352 	}
353 
354 	MPO_DEBUG("lgrp_traverse: Node Count: %ld\n", n_nodes);
355 	MPO_DEBUG("lgrp_traverse: md: %p\n", md);
356 	MPO_DEBUG("lgrp_traverse: root: %lx\n", root);
357 	MPO_DEBUG("lgrp_traverse: mem_lgs: %d\n", n_lgrpnodes);
358 	MPO_DEBUG("lgrp_traverse: cpus: %d\n", n_cpunodes);
359 	MPO_DEBUG("lgrp_traverse: mblocks: %d\n", n_mblocks);
360 
361 	for (i = 0; i < n_lgrpnodes; i++) {
362 		mpo_lgroup[i].node = lgrpnodes[i];
363 		mpo_lgroup[i].id = i;
364 		mpo_lgroup[i].ncpu = 0;
365 		result = get_int(md, lgrpnodes[i], PROP_LG_MASK,
366 		    &mpo_lgroup[i].addr_mask);
367 		result |= get_int(md, lgrpnodes[i], PROP_LG_MATCH,
368 		    &mpo_lgroup[i].addr_match);
369 
370 		/*
371 		 * If either the mask or match properties are missing, set to 0
372 		 */
373 		if (result < 0) {
374 			mpo_lgroup[i].addr_mask = 0;
375 			mpo_lgroup[i].addr_match = 0;
376 		}
377 
378 		/* Set latency to 0 if property not present */
379 
380 		result = get_int(md, lgrpnodes[i], PROP_LG_LATENCY,
381 		    &mpo_lgroup[i].latency);
382 		if (result < 0)
383 			mpo_lgroup[i].latency = 0;
384 	}
385 
386 	/*
387 	 * Sub-page level interleave is not yet supported.  Check for it,
388 	 * and remove sub-page interleaved lgroups from mpo_lgroup and
389 	 * n_lgrpnodes.  If no lgroups are left, return.
390 	 */
391 
392 	sub_page_fix = fix_interleave();
393 	if (n_lgrpnodes == 0) {
394 		ret_val = -1;
395 		goto fail;
396 	}
397 
398 	/* Ensure that all of the addr_mask values are the same */
399 
400 	for (i = 0; i < n_lgrpnodes; i++) {
401 		if (mpo_lgroup[0].addr_mask != mpo_lgroup[i].addr_mask) {
402 			MPO_STATUS("lgrp_traverse: "
403 			    "addr_mask values are not the same\n");
404 			ret_val = -1;
405 			goto fail;
406 		}
407 	}
408 
409 	/*
410 	 * Ensure that all lgrp nodes see all the mblocks. However, if
411 	 * sub-page interleave is being fixed, they do not, so skip
412 	 * the check.
413 	 */
414 
415 	if (sub_page_fix == 0) {
416 		for (i = 0; i < n_lgrpnodes; i++) {
417 			j = md_alloc_scan_dag(md, mpo_lgroup[i].node,
418 			    PROP_LG_MBLOCK, "fwd", &nodes);
419 			md_free_scan_dag(md, &nodes);
420 			if (j != n_mblocks) {
421 				MPO_STATUS("lgrp_traverse: "
422 				    "sub-page interleave is being fixed\n");
423 				ret_val = -1;
424 				goto fail;
425 			}
426 		}
427 	}
428 
429 	/*
430 	 * Use the address mask from the first lgroup node
431 	 * to establish our home_mask.
432 	 */
433 	home_mask = mpo_lgroup[0].addr_mask;
434 	home_mask_pfn = btop(home_mask);
435 	home_mask_shift = lowbit(home_mask) - 1;
436 	home_mask_pfn_shift = home_mask_shift - PAGESHIFT;
437 	mnode_pages = btop(1ULL << home_mask_shift);
438 
439 	/*
440 	 * How many values are possible in home mask?  Assume the mask
441 	 * bits are contiguous.
442 	 */
443 	max_locality_groups =
444 	    1 << highbit(home_mask_pfn >> home_mask_pfn_shift);
445 
446 	/* Now verify the home mask bits are contiguous */
447 
448 	if (max_locality_groups - 1 != home_mask_pfn >> home_mask_pfn_shift) {
449 		MPO_STATUS("lgrp_traverse: "
450 		    "home mask bits are not contiguous\n");
451 		ret_val = -1;
452 		goto fail;
453 	}
454 
455 	/* Record all of the home bits */
456 
457 	for (i = 0; i < n_lgrpnodes; i++) {
458 		HOMESET_ADD(mem_lg_homeset,
459 		    mpo_lgroup[i].addr_match >> home_mask_shift);
460 	}
461 
462 	/* Count the number different "home"  mem_lg's we've discovered */
463 
464 	n_locality_groups = unique_home_mem_lg_count(mem_lg_homeset);
465 
466 	/* If we have only 1 locality group then we can exit */
467 	if (n_locality_groups == 1) {
468 		MPO_STATUS("lgrp_traverse: n_locality_groups == 1\n");
469 		ret_val = -1;
470 		goto fail;
471 	}
472 
473 	/*
474 	 * Set the latencies.  A CPU's lgroup is defined by the lowest
475 	 * latency found.  All other memory is considered remote, and the
476 	 * remote latency is represented by the highest latency found.
477 	 * Thus hierarchical lgroups, if any, are approximated by a
478 	 * two level scheme.
479 	 *
480 	 * The Solaris MPO framework by convention wants to see latencies
481 	 * in units of nano-sec/10. In the MD, the units are defined to be
482 	 * pico-seconds.
483 	 */
484 
485 	lower_latency = mpo_lgroup[0].latency;
486 	higher_latency = mpo_lgroup[0].latency;
487 
488 	for (i = 1; i < n_lgrpnodes; i++) {
489 		if (mpo_lgroup[i].latency < lower_latency) {
490 			lower_latency = mpo_lgroup[i].latency;
491 		}
492 		if (mpo_lgroup[i].latency > higher_latency) {
493 			higher_latency = mpo_lgroup[i].latency;
494 		}
495 	}
496 	lower_latency /= 10000;
497 	higher_latency /= 10000;
498 
499 	/* Clear our CPU data */
500 
501 	for (i = 0; i < NCPU; i++) {
502 		mpo_cpu[i].home = 0;
503 		mpo_cpu[i].latency = (uint_t)(-1);
504 	}
505 
506 	/* Build the CPU nodes */
507 	for (i = 0; i < n_cpunodes; i++) {
508 
509 		/* Read in the lgroup nodes */
510 
511 		result = get_int(md, cpunodes[i], PROP_LG_CPU_ID, &k);
512 		if (result < 0) {
513 			MPO_STATUS("lgrp_traverse: PROP_LG_CPU_ID missing\n");
514 			ret_val = -1;
515 			goto fail;
516 		}
517 
518 		n_lgroups = md_alloc_scan_dag(md, cpunodes[i], PROP_LG_MEM_LG,
519 		    "fwd", &nodes);
520 		if (n_lgroups <= 0) {
521 			MPO_STATUS("lgrp_traverse: PROP_LG_MEM_LG missing");
522 			ret_val = -1;
523 			goto fail;
524 		}
525 
526 		/*
527 		 * Find the lgroup this cpu belongs to with the lowest latency.
528 		 * Check all the lgrp nodes connected to this CPU to determine
529 		 * which has the smallest latency.
530 		 */
531 
532 		for (j = 0; j < n_lgroups; j++) {
533 			for (o = 0; o < n_lgrpnodes; o++) {
534 				if (nodes[j] == mpo_lgroup[o].node) {
535 					if (mpo_lgroup[o].latency <
536 					    mpo_cpu[k].latency) {
537 						mpo_cpu[k].home =
538 						    mpo_lgroup[o].addr_match
539 						    >> home_mask_shift;
540 						mpo_cpu[k].latency =
541 						    mpo_lgroup[o].latency;
542 						mpo_lgroup[o].ncpu++;
543 					}
544 				}
545 			}
546 		}
547 		md_free_scan_dag(md, &nodes);
548 	}
549 
550 	/* Validate that no large pages cross mnode boundaries. */
551 	if (valid_pages(md, cpunodes[0]) == 0) {
552 		ret_val = -1;
553 		goto fail;
554 	}
555 
556 fail:
557 	/* MD cookies are no longer valid; ensure they are not used again. */
558 	for (i = 0; i < n_mblocks; i++)
559 		mpo_mblock[i].node = MDE_INVAL_ELEM_COOKIE;
560 	for (i = 0; i < n_lgrpnodes; i++)
561 		mpo_lgroup[i].node = MDE_INVAL_ELEM_COOKIE;
562 
563 	if (n_cpunodes > 0)
564 		md_free_scan_dag(md, &cpunodes);
565 	if (n_lgrpnodes > 0)
566 		md_free_scan_dag(md, &lgrpnodes);
567 	if (n_mblocks > 0)
568 		md_free_scan_dag(md, &mblocknodes);
569 	else
570 		panic("lgrp_traverse: No memory blocks found");
571 
572 	if (ret_val == 0)
573 		MPO_STATUS("MPO feature is enabled.\n");
574 
575 	return (ret_val);
576 }
577 
578 /*
579  *  Determine the number of unique mem_lg's present in our system
580  */
581 static	int
582 unique_home_mem_lg_count(uint64_t mem_lg_homeset)
583 {
584 	int homeid;
585 	int count = 0;
586 
587 	/*
588 	 * Scan the "home" bits of the mem_lgs, count
589 	 * the number that are unique.
590 	 */
591 
592 	for (homeid = 0; homeid < NLGRPS_MAX; homeid++) {
593 		if (MEM_LG_ISMEMBER(mem_lg_homeset, homeid)) {
594 			count++;
595 		}
596 	}
597 
598 	MPO_DEBUG("unique_home_mem_lg_count: homeset %lx\n",
599 	    mem_lg_homeset);
600 	MPO_DEBUG("unique_home_mem_lg_count: count: %d\n", count);
601 
602 	/* Default must be at least one */
603 	if (count == 0)
604 		count = 1;
605 
606 	return (count);
607 }
608 
609 /*
610  * Platform specific lgroup initialization
611  */
612 void
613 plat_lgrp_init(void)
614 {
615 	md_t *md;
616 	int i, rc, ncpu_min;
617 
618 	/* Get the Machine Descriptor handle */
619 
620 	md = md_get_handle();
621 
622 	/* If not, we cannot continue */
623 
624 	if (md == NULL) {
625 		panic("cannot access machine descriptor\n");
626 	} else {
627 		rc = lgrp_traverse(md);
628 		(void) md_fini_handle(md);
629 	}
630 
631 	/*
632 	 * If we can't process the MD for lgroups then at least let the
633 	 * system try to boot.  Assume we have one lgroup so that
634 	 * when plat_build_mem_nodes is called, it will attempt to init
635 	 * an mnode based on the supplied memory segment.
636 	 */
637 
638 	if (rc == -1) {
639 		home_mask_pfn = 0;
640 		max_locality_groups = 1;
641 		n_locality_groups = 1;
642 		return;
643 	}
644 
645 	mem_node_pfn_shift = 0;
646 	mem_node_physalign = 0;
647 
648 	/* Use lgroup-aware TSB allocations */
649 	tsb_lgrp_affinity = 1;
650 
651 	/*
652 	 * lgrp_expand_proc_thresh is the minimum load on the lgroups
653 	 * this process is currently running on before considering
654 	 * expanding threads to another lgroup.
655 	 *
656 	 * lgrp_expand_proc_diff determines how much less the remote lgroup
657 	 * must be loaded before expanding to it.
658 	 *
659 	 * On sun4v CMT processors, threads share a core pipeline, and
660 	 * at less than 100% utilization, best throughput is obtained by
661 	 * spreading threads across more cores, even if some are in a
662 	 * different lgroup.  Spread threads to a new lgroup if the
663 	 * current group is more than 50% loaded.  Because of virtualization,
664 	 * lgroups may have different numbers of CPUs, but the tunables
665 	 * apply to all lgroups, so find the smallest lgroup and compute
666 	 * 50% loading.
667 	 */
668 
669 	ncpu_min = NCPU;
670 	for (i = 0; i < n_lgrpnodes; i++) {
671 		int ncpu = mpo_lgroup[i].ncpu;
672 		if (ncpu != 0 && ncpu < ncpu_min)
673 			ncpu_min = ncpu;
674 	}
675 	lgrp_expand_proc_thresh = ncpu_min * lgrp_loadavg_max_effect / 2;
676 
677 	/* new home may only be half as loaded as the existing home to use it */
678 	lgrp_expand_proc_diff = lgrp_expand_proc_thresh / 2;
679 
680 	lgrp_loadavg_tolerance = lgrp_loadavg_max_effect;
681 
682 	/* Require that a home lgroup have some memory to be chosen */
683 	lgrp_mem_free_thresh = 1;
684 
685 	/* Standard home-on-next-touch policy */
686 	lgrp_mem_policy_root = LGRP_MEM_POLICY_NEXT;
687 
688 	/* Disable option to choose root lgroup if all leaf lgroups are busy */
689 	lgrp_load_thresh = UINT32_MAX;
690 }
691 
692 /*
693  *  Helper routine for debugging calls to mem_node_add_slice()
694  */
695 static	void
696 mpo_mem_node_add_slice(pfn_t basepfn, pfn_t endpfn)
697 {
698 #if defined(DEBUG) && !defined(lint)
699 	static int slice_count = 0;
700 
701 	slice_count++;
702 	MPO_DEBUG("mem_add_slice(%d): basepfn: %lx  endpfn: %lx\n",
703 	    slice_count, basepfn, endpfn);
704 #endif
705 	mem_node_add_slice(basepfn, endpfn);
706 }
707 
708 /*
709  *  Helper routine for debugging calls to plat_assign_lgrphand_to_mem_node()
710  */
711 static	void
712 mpo_plat_assign_lgrphand_to_mem_node(lgrp_handle_t plathand, int mnode)
713 {
714 	MPO_DEBUG("plat_assign_to_mem_nodes: lgroup home %ld,"
715 	    "mnode index: %d\n", plathand, mnode);
716 	plat_assign_lgrphand_to_mem_node(plathand, mnode);
717 }
718 
719 /*
720  * plat_build_mem_nodes()
721  *
722  * Define the mem_nodes based on the modified boot memory list,
723  * or based on info read from the MD in plat_lgrp_init().
724  *
725  * When the home mask lies in the middle of the address bits (as it does on
726  * Victoria Falls), then the memory in one mem_node is no longer contiguous;
727  * it is striped across an mblock in a repeating pattern of contiguous memory
728  * followed by a gap.  The stripe width is the size of the contiguous piece.
729  * The stride is the distance from the start of one contiguous piece to the
730  * start of the next.  The gap is thus stride - stripe_width.
731  *
732  * The stripe of an mnode that falls within an mblock is described by the type
733  * mem_stripe_t, and there is one mem_stripe_t per mnode per mblock.  The
734  * mem_stripe_t's are kept in a global array mem_stripes[].  The index into
735  * this array is predetermined.  The mem_stripe_t that describes mnode m
736  * within mpo_mblock[i] is stored at
737  *	 mem_stripes[ m + i * max_locality_groups ]
738  *
739  * max_locality_groups is the total number of possible locality groups,
740  * as defined by the size of the home mask, even if the memory assigned
741  * to the domain is small and does not cover all the lgroups.  Thus some
742  * mem_stripe_t's may be empty.
743  *
744  * The members of mem_stripe_t are:
745  *	physbase: First valid page in mem_node in the corresponding mblock
746  *	physmax: Last valid page in mem_node in mblock
747  *	offset:  The full stripe width starts at physbase - offset.
748  *	    Thus if offset is non-zero, this mem_node starts in the middle
749  *	    of a stripe width, and the second full stripe starts at
750  *	    physbase - offset + stride.  (even though physmax may fall in the
751  *	    middle of a stripe width, we do not save the ending fragment size
752  *	    in this data structure.)
753  *	exists: Set to 1 if the mblock has memory in this mem_node stripe.
754  *
755  *	The stripe width is kept in the global mnode_pages.
756  *	The stride is kept in the global mnode_stride.
757  *	All the above use pfn's as the unit.
758  *
759  * As an example, the memory layout for a domain with 2 mblocks and 4
760  * mem_nodes 0,1,2,3 could look like this:
761  *
762  *	123012301230 ...	012301230123 ...
763  *	  mblock 0		  mblock 1
764  */
765 
766 void
767 plat_build_mem_nodes(u_longlong_t *list, size_t nelems)
768 {
769 	lgrp_handle_t lgrphand, lgrp_start;
770 	int i, mnode, elem;
771 	uint64_t offset, stripe_end, base, len, end, ra_to_pa, stride;
772 	uint64_t stripe, frag, remove;
773 	mem_stripe_t *ms;
774 
775 	/* Pre-reserve space for plat_assign_lgrphand_to_mem_node */
776 	max_mem_nodes = max_locality_groups;
777 
778 	/* Check for non-MPO sun4v platforms */
779 	if (n_locality_groups <= 1) {
780 		mpo_plat_assign_lgrphand_to_mem_node(LGRP_DEFAULT_HANDLE, 0);
781 		for (elem = 0; elem < nelems; elem += 2) {
782 			base = list[elem];
783 			len = list[elem+1];
784 
785 			mpo_mem_node_add_slice(btop(base),
786 			    btop(base + len - 1));
787 		}
788 		mem_node_pfn_shift = 0;
789 		mem_node_physalign = 0;
790 		n_mem_stripes = 0;
791 		if (n_mblocks == 1)
792 			return;
793 	}
794 
795 	bzero(mem_stripes, mstripesz);
796 	stripe = ptob(mnode_pages);
797 	stride = max_locality_groups * stripe;
798 
799 	/* Save commonly used values in globals */
800 	mnode_stride = btop(stride);
801 	n_mem_stripes = max_locality_groups * n_mblocks;
802 	stripe_shift = highbit(max_locality_groups) - 1;
803 
804 	for (i = 0; i < n_mblocks; i++) {
805 
806 		base = mpo_mblock[i].base;
807 		end = mpo_mblock[i].base + mpo_mblock[i].size;
808 		ra_to_pa = mpo_mblock[i].ra_to_pa;
809 		mpo_mblock[i].base_pfn = btop(base);
810 		mpo_mblock[i].end_pfn = btop(end - 1);
811 
812 		/* Find the offset from the prev stripe boundary in PA space. */
813 		offset = (base + ra_to_pa) & (stripe - 1);
814 
815 		/* Set the next stripe boundary. */
816 		stripe_end = base - offset + stripe;
817 
818 		lgrp_start = (((base + ra_to_pa) & home_mask) >>
819 		    home_mask_shift);
820 		lgrphand = lgrp_start;
821 
822 		/*
823 		 * Loop over all lgroups covered by the mblock, creating a
824 		 * stripe for each.  Stop when lgrp_start is visited again.
825 		 */
826 		do {
827 			/* mblock may not span all lgroups */
828 			if (base >= end)
829 				break;
830 
831 			mnode = lgrphand;
832 			ASSERT(mnode < max_mem_nodes);
833 			mpo_mblock[i].mnode_mask |= (mnodeset_t)1 << mnode;
834 
835 			/*
836 			 * Calculate the size of the fragment that does not
837 			 * belong to the mnode in the last partial stride.
838 			 */
839 			frag = (end - (base - offset)) & (stride - 1);
840 			if (frag == 0) {
841 				/* remove the gap */
842 				remove = stride - stripe;
843 			} else if (frag < stripe) {
844 				/* fragment fits in stripe; keep it all */
845 				remove = 0;
846 			} else {
847 				/* fragment is large; trim after whole stripe */
848 				remove = frag - stripe;
849 			}
850 
851 			ms = &mem_stripes[i * max_locality_groups + mnode];
852 			ms->physbase = btop(base);
853 			ms->physmax = btop(end - 1 - remove);
854 			ms->offset = btop(offset);
855 			ms->exists = 1;
856 
857 			/*
858 			 * If we have only 1 lgroup and multiple mblocks,
859 			 * then we have already established our lgrp handle
860 			 * to mem_node and mem_node_config values above.
861 			 */
862 			if (n_locality_groups > 1) {
863 				mpo_plat_assign_lgrphand_to_mem_node(lgrphand,
864 				    mnode);
865 				mpo_mem_node_add_slice(ms->physbase,
866 				    ms->physmax);
867 			}
868 			base = stripe_end;
869 			stripe_end += stripe;
870 			offset = 0;
871 			lgrphand = (((base + ra_to_pa) & home_mask) >>
872 			    home_mask_shift);
873 		} while (lgrphand != lgrp_start);
874 	}
875 
876 	/*
877 	 * Indicate to vm_pagelist that the hpm_counters array
878 	 * should be shared because the ranges overlap.
879 	 */
880 	if (max_mem_nodes > 1) {
881 		interleaved_mnodes = 1;
882 	}
883 }
884 
885 /*
886  * Return the locality group value for the supplied processor
887  */
888 lgrp_handle_t
889 plat_lgrp_cpu_to_hand(processorid_t id)
890 {
891 	if (n_locality_groups > 1) {
892 		return ((lgrp_handle_t)mpo_cpu[(int)id].home);
893 	} else {
894 		return ((lgrp_handle_t)LGRP_DEFAULT_HANDLE); /* Default */
895 	}
896 }
897 
898 int
899 plat_lgrp_latency(lgrp_handle_t from, lgrp_handle_t to)
900 {
901 	/*
902 	 * Return min remote latency when there are more than two lgroups
903 	 * (root and child) and getting latency between two different lgroups
904 	 * or root is involved.
905 	 */
906 	if (lgrp_optimizations() && (from != to ||
907 	    from == LGRP_DEFAULT_HANDLE || to == LGRP_DEFAULT_HANDLE)) {
908 		return ((int)higher_latency);
909 	} else {
910 		return ((int)lower_latency);
911 	}
912 }
913 
914 int
915 plat_pfn_to_mem_node(pfn_t pfn)
916 {
917 	int i, mnode;
918 	pfn_t ra_to_pa_pfn;
919 	struct mblock_md *mb;
920 
921 	if (n_locality_groups <= 1)
922 		return (0);
923 
924 	/*
925 	 * The mnode is defined to be 1:1 with the lgroup handle, which
926 	 * is taken from from the home bits.  Find the mblock in which
927 	 * the pfn falls to get the ra_to_pa adjustment, and extract
928 	 * the home bits.
929 	 */
930 	mb = &mpo_mblock[0];
931 	for (i = 0; i < n_mblocks; i++) {
932 		if (pfn >= mb->base_pfn && pfn <= mb->end_pfn) {
933 			ra_to_pa_pfn = btop(mb->ra_to_pa);
934 			mnode = (((pfn + ra_to_pa_pfn) & home_mask_pfn) >>
935 			    home_mask_pfn_shift);
936 			ASSERT(mnode < max_mem_nodes);
937 			return (mnode);
938 		}
939 		mb++;
940 	}
941 
942 	panic("plat_pfn_to_mem_node() failed to find mblock: pfn=%lx\n", pfn);
943 	return (pfn);
944 }
945 
946 /*
947  * plat_rapfn_to_papfn
948  *
949  * Convert a pfn in RA space to a pfn in PA space, in which the page coloring
950  * and home mask bits are correct.  The upper bits do not necessarily
951  * match the actual PA, however.
952  */
953 pfn_t
954 plat_rapfn_to_papfn(pfn_t pfn)
955 {
956 	int i;
957 	pfn_t ra_to_pa_pfn;
958 	struct mblock_md *mb;
959 
960 	ASSERT(n_mblocks > 0);
961 	if (n_mblocks == 1)
962 		return (pfn + base_ra_to_pa_pfn);
963 
964 	/*
965 	 * Find the mblock in which the pfn falls
966 	 * in order to get the ra_to_pa adjustment.
967 	 */
968 	for (mb = &mpo_mblock[0], i = 0; i < n_mblocks; i++, mb++) {
969 		if (pfn <= mb->end_pfn && pfn >= mb->base_pfn) {
970 			ra_to_pa_pfn = btop(mb->ra_to_pa);
971 			return (pfn + ra_to_pa_pfn);
972 		}
973 	}
974 
975 	panic("plat_rapfn_to_papfn() failed to find mblock: pfn=%lx\n", pfn);
976 	return (pfn);
977 }
978 
979 /*
980  * plat_mem_node_iterator_init()
981  *	Initialize cookie to iterate over pfn's in an mnode.  There is
982  *	no additional iterator function.  The caller uses the info from
983  *	the iterator structure directly.
984  *
985  *	pfn: starting pfn.
986  * 	mnode: desired mnode.
987  *	init: set to 1 for full init, 0 for continuation
988  *
989  *	Returns the appropriate starting pfn for the iteration
990  *	the same as the input pfn if it falls in an mblock.
991  *	Returns the (pfn_t)-1 value if the input pfn lies past
992  *	the last valid mnode pfn.
993  */
994 pfn_t
995 plat_mem_node_iterator_init(pfn_t pfn, int mnode,
996     mem_node_iterator_t *it, int init)
997 {
998 	int i;
999 	struct mblock_md *mblock;
1000 	pfn_t base, end;
1001 
1002 	ASSERT(it != NULL);
1003 	ASSERT(mnode >= 0 && mnode < max_mem_nodes);
1004 	ASSERT(n_mblocks > 0);
1005 
1006 	if (init) {
1007 		it->mi_last_mblock = 0;
1008 		it->mi_init = 1;
1009 	}
1010 
1011 	/* Check if mpo is not enabled and we only have one mblock */
1012 	if (n_locality_groups == 1 && n_mblocks == 1) {
1013 		it->mi_mnode = mnode;
1014 		it->mi_ra_to_pa = base_ra_to_pa_pfn;
1015 		it->mi_mnode_pfn_mask = 0;
1016 		it->mi_mnode_pfn_shift = 0;
1017 		it->mi_mnode_mask = 0;
1018 		it->mi_mblock_base = mem_node_config[mnode].physbase;
1019 		it->mi_mblock_end = mem_node_config[mnode].physmax;
1020 		if (pfn < it->mi_mblock_base)
1021 			pfn = it->mi_mblock_base;
1022 		else if (pfn > it->mi_mblock_end)
1023 			pfn = (pfn_t)-1;
1024 		return (pfn);
1025 	}
1026 
1027 	/*
1028 	 * Find mblock that contains pfn, or first mblock after pfn,
1029 	 * else pfn is out of bounds, so use the last mblock.
1030 	 * mblocks are sorted in ascending address order.
1031 	 */
1032 	ASSERT(it->mi_last_mblock < n_mblocks);
1033 	ASSERT(init == 1 || pfn > mpo_mblock[it->mi_last_mblock].end_pfn);
1034 	i = init ? 0 : it->mi_last_mblock + 1;
1035 	if (i == n_mblocks)
1036 		return ((pfn_t)-1);
1037 
1038 	for (; i < n_mblocks; i++) {
1039 		if ((mpo_mblock[i].mnode_mask & ((mnodeset_t)1 << mnode)) &&
1040 		    (pfn <= mpo_mblock[i].end_pfn))
1041 			break;
1042 	}
1043 	if (i == n_mblocks) {
1044 		it->mi_last_mblock = i - 1;
1045 		return ((pfn_t)-1);
1046 	}
1047 	it->mi_last_mblock = i;
1048 
1049 	/*
1050 	 * Memory stripes are defined if there is more than one locality
1051 	 * group, so use the stripe bounds.  Otherwise use mblock bounds.
1052 	 */
1053 	mblock = &mpo_mblock[i];
1054 	if (n_mem_stripes > 0) {
1055 		mem_stripe_t *ms =
1056 		    &mem_stripes[i * max_locality_groups + mnode];
1057 		base = ms->physbase;
1058 		end = ms->physmax;
1059 	} else {
1060 		ASSERT(mnode == 0);
1061 		base = mblock->base_pfn;
1062 		end = mblock->end_pfn;
1063 	}
1064 
1065 	it->mi_mnode = mnode;
1066 	it->mi_ra_to_pa = btop(mblock->ra_to_pa);
1067 	it->mi_mblock_base = base;
1068 	it->mi_mblock_end = end;
1069 	it->mi_mnode_pfn_mask = home_mask_pfn;	/* is 0 for non-MPO case */
1070 	it->mi_mnode_pfn_shift = home_mask_pfn_shift;
1071 	it->mi_mnode_mask = max_locality_groups - 1;
1072 	if (pfn < base)
1073 		pfn = base;
1074 	else if (pfn > end)
1075 		pfn = (pfn_t)-1;
1076 	return (pfn);
1077 }
1078 
1079 /*
1080  * plat_mem_node_intersect_range()
1081  *
1082  * Find the intersection between a memnode and a range of pfn's.
1083  */
1084 void
1085 plat_mem_node_intersect_range(pfn_t test_base, pgcnt_t test_len,
1086     int mnode, pgcnt_t *npages_out)
1087 {
1088 	pfn_t offset, len, hole, base, end, test_end, frag;
1089 	pfn_t nearest;
1090 	mem_stripe_t *ms;
1091 	int i, npages;
1092 
1093 	*npages_out = 0;
1094 
1095 	if (!mem_node_config[mnode].exists || test_len == 0)
1096 		return;
1097 
1098 	base = mem_node_config[mnode].physbase;
1099 	end = mem_node_config[mnode].physmax;
1100 
1101 	test_end = test_base + test_len - 1;
1102 	if (end < test_base || base > test_end)
1103 		return;
1104 
1105 	if (n_locality_groups == 1) {
1106 		*npages_out = MIN(test_end, end) - MAX(test_base, base) + 1;
1107 		return;
1108 	}
1109 
1110 	hole = mnode_stride - mnode_pages;
1111 	npages = 0;
1112 
1113 	/*
1114 	 * Iterate over all the stripes for this mnode (one per mblock),
1115 	 * find the intersection with each, and accumulate the intersections.
1116 	 *
1117 	 * Determing the intersection with a stripe is tricky.  If base or end
1118 	 * fall outside the mem_node bounds, round them to physbase/physmax of
1119 	 * mem_node.  If base or end fall in a gap, round them to start of
1120 	 * nearest stripe.  If they fall within a stripe, keep base or end,
1121 	 * but calculate the fragment size that should be excluded from the
1122 	 * stripe.  Calculate how many strides fall in the adjusted range,
1123 	 * multiply by stripe width, and add the start and end fragments.
1124 	 */
1125 
1126 	for (i = mnode; i < n_mem_stripes; i += max_locality_groups) {
1127 		ms = &mem_stripes[i];
1128 		if (ms->exists &&
1129 		    test_base <= (end = ms->physmax) &&
1130 		    test_end >= (base = ms->physbase)) {
1131 
1132 			offset = ms->offset;
1133 
1134 			if (test_base > base) {
1135 				/* Round test_base to next multiple of stride */
1136 				len = P2ROUNDUP(test_base - (base - offset),
1137 				    mnode_stride);
1138 				nearest = base - offset + len;
1139 				/*
1140 				 * Compute distance from test_base to the
1141 				 * stride boundary to see if test_base falls
1142 				 * in the stripe or in the hole.
1143 				 */
1144 				if (nearest - test_base > hole) {
1145 					/*
1146 					 * test_base lies in stripe,
1147 					 * and offset should be excluded.
1148 					 */
1149 					offset = test_base -
1150 					    (nearest - mnode_stride);
1151 					base = test_base;
1152 				} else {
1153 					/* round up to next stripe start */
1154 					offset = 0;
1155 					base = nearest;
1156 					if (base > end)
1157 						continue;
1158 				}
1159 
1160 			}
1161 
1162 			if (test_end < end)
1163 				end = test_end;
1164 			end++;		/* adjust to an exclusive bound */
1165 
1166 			/* Round end to next multiple of stride */
1167 			len = P2ROUNDUP(end - (base - offset), mnode_stride);
1168 			nearest = (base - offset) + len;
1169 			if (nearest - end <= hole) {
1170 				/* end falls in hole, use entire last stripe */
1171 				frag = 0;
1172 			} else {
1173 				/* end falls in stripe, compute fragment */
1174 				frag = nearest - hole - end;
1175 			}
1176 
1177 			len = (len >> stripe_shift) - offset - frag;
1178 			npages += len;
1179 		}
1180 	}
1181 
1182 	*npages_out = npages;
1183 }
1184 
1185 /*
1186  * valid_pages()
1187  *
1188  * Return 1 if pages are valid and do not cross mnode boundaries
1189  * (which would break page free list assumptions), and 0 otherwise.
1190  */
1191 
1192 #define	MNODE(pa)	\
1193 	((btop(pa) & home_mask_pfn) >> home_mask_pfn_shift)
1194 
1195 static int
1196 valid_pages(md_t *md, mde_cookie_t cpu0)
1197 {
1198 	int i, max_szc;
1199 	uint64_t last_page_base, szc_mask;
1200 	uint64_t max_page_len, max_coalesce_len;
1201 	struct mblock_md *mb = mpo_mblock;
1202 
1203 	/*
1204 	 * Find the smaller of the largest page possible and supported.
1205 	 * mmu_exported_pagesize_mask is not yet initialized, so read
1206 	 * it from the MD.  Apply minimal fixups in case of broken MDs
1207 	 * to get a sane mask.
1208 	 */
1209 
1210 	if (md_get_prop_val(md, cpu0, "mmu-page-size-list", &szc_mask))
1211 		szc_mask = 0;
1212 	szc_mask |=  (1 << TTE4M);	/* largest in sun4v default support */
1213 	max_szc = highbit(szc_mask) - 1;
1214 	if (max_szc > TTE256M)
1215 		max_szc = TTE256M;
1216 	max_page_len = TTEBYTES(max_szc);
1217 
1218 	/*
1219 	 * Page coalescing code coalesces all sizes up to 256M on sun4v, even
1220 	 * if mmu-page-size-list does not contain it, so 256M pages must fall
1221 	 * within one mnode to use MPO.
1222 	 */
1223 	max_coalesce_len = TTEBYTES(TTE256M);
1224 	ASSERT(max_coalesce_len >= max_page_len);
1225 
1226 	if (ptob(mnode_pages) < max_coalesce_len) {
1227 		MPO_STATUS("Page too large; MPO disabled: page = %lx, "
1228 		    "mnode slice = %lx\n", max_coalesce_len, ptob(mnode_pages));
1229 		return (0);
1230 	}
1231 
1232 	for (i = 0; i < n_mblocks; i++) {
1233 		uint64_t base = mb->base;
1234 		uint64_t end = mb->base + mb->size - 1;
1235 		uint64_t ra_to_pa = mb->ra_to_pa;
1236 
1237 		/*
1238 		 * If mblock is smaller than the max page size, then
1239 		 * RA = PA mod MAXPAGE is not guaranteed, but it must
1240 		 * not span mnodes.
1241 		 */
1242 		if (mb->size < max_page_len) {
1243 			if (MNODE(base + ra_to_pa) != MNODE(end + ra_to_pa)) {
1244 				MPO_STATUS("Small mblock spans mnodes; "
1245 				    "MPO disabled: base = %lx, end = %lx, "
1246 				    "ra2pa = %lx\n", base, end, ra_to_pa);
1247 				return (0);
1248 			}
1249 		} else {
1250 			/* Verify RA = PA mod MAXPAGE, using coalesce size */
1251 			uint64_t pa_base = base + ra_to_pa;
1252 			if ((base & (max_coalesce_len - 1)) !=
1253 			    (pa_base & (max_coalesce_len - 1))) {
1254 				MPO_STATUS("bad page alignment; MPO disabled: "
1255 				    "ra = %lx, pa = %lx, pagelen = %lx\n",
1256 				    base, pa_base, max_coalesce_len);
1257 				return (0);
1258 			}
1259 		}
1260 
1261 		/*
1262 		 * Find start of last large page in mblock in RA space.
1263 		 * If page extends into the next mblock, verify the
1264 		 * mnode does not change.
1265 		 */
1266 		last_page_base = P2ALIGN(end, max_coalesce_len);
1267 		if (i + 1 < n_mblocks &&
1268 		    last_page_base + max_coalesce_len > mb[1].base &&
1269 		    MNODE(last_page_base + ra_to_pa) !=
1270 		    MNODE(mb[1].base + mb[1].ra_to_pa)) {
1271 			MPO_STATUS("Large page spans mblocks; MPO disabled: "
1272 			    "end = %lx, ra2pa = %lx, base = %lx, ra2pa = %lx, "
1273 			    "pagelen = %lx\n", end, ra_to_pa, mb[1].base,
1274 			    mb[1].ra_to_pa, max_coalesce_len);
1275 			return (0);
1276 		}
1277 
1278 		mb++;
1279 	}
1280 	return (1);
1281 }
1282 
1283 
1284 /*
1285  * fix_interleave() - Find lgroups with sub-page sized memory interleave,
1286  * if any, and remove them.  This yields a config where the "coarse
1287  * grained" lgroups cover all of memory, even though part of that memory
1288  * is fine grain interleaved and does not deliver a purely local memory
1289  * latency.
1290  *
1291  * This function reads and modifies the globals:
1292  *	mpo_lgroup[], n_lgrpnodes
1293  *
1294  * Returns 1 if lgroup nodes were removed, 0 otherwise.
1295  */
1296 
1297 static int
1298 fix_interleave(void)
1299 {
1300 	int i, j;
1301 	uint64_t mask = 0;
1302 
1303 	j = 0;
1304 	for (i = 0; i < n_lgrpnodes; i++) {
1305 		if ((mpo_lgroup[i].addr_mask & PAGEOFFSET) != 0) {
1306 			/* remove this lgroup */
1307 			mask = mpo_lgroup[i].addr_mask;
1308 		} else {
1309 			mpo_lgroup[j++] = mpo_lgroup[i];
1310 		}
1311 	}
1312 	n_lgrpnodes = j;
1313 
1314 	if (mask != 0)
1315 		MPO_STATUS("sub-page interleave %lx found; "
1316 		    "removing lgroup.\n", mask);
1317 
1318 	return (mask != 0);
1319 }
1320